From dd06f86b98527b6b6425ea679bea5cc347f5afb4 Mon Sep 17 00:00:00 2001 From: Vignesh Venkatasubramanian Date: Fri, 2 Jun 2023 03:02:16 +0000 Subject: libyuv: Update to r1871 (2a6cb743) Changes from upstream: https://chromium.googlesource.com/libyuv/libyuv/+log/d53f1bee..2a6cb743 The intention of the CL is to import the functions necessary to enable AV1 (and AVIF) 12-bit color conversion. Bug: 268505204 Test: Builds. Media and Camera CTS tests pass. (cherry picked from https://googleplex-android-review.googlesource.com/q/commit:27750a13c6eaacb9f716da3fe1734a8d106d7ff4) Merged-In: I756d3bd5047d4719659f9e1a449217b1940e51a4 Change-Id: I756d3bd5047d4719659f9e1a449217b1940e51a4 --- METADATA | 4 +- README.version | 2 +- files/.gn | 4 +- files/.vpython3 | 4 +- files/Android.bp | 1 + files/BUILD.gn | 22 +- files/CMakeLists.txt | 22 +- files/DEPS | 1091 ++-- files/README.chromium | 2 +- files/README.md | 1 + files/build_overrides/build.gni | 3 + files/build_overrides/partition_alloc.gni | 17 + files/docs/environment_variables.md | 3 + files/docs/getting_started.md | 29 + files/include/libyuv/convert.h | 123 + files/include/libyuv/convert_argb.h | 126 + files/include/libyuv/convert_from_argb.h | 51 +- files/include/libyuv/cpu_id.h | 7 + files/include/libyuv/planar_functions.h | 56 +- files/include/libyuv/rotate.h | 64 + files/include/libyuv/rotate_row.h | 45 + files/include/libyuv/row.h | 1077 +++- files/include/libyuv/scale_row.h | 43 + files/include/libyuv/version.h | 2 +- files/infra/config/PRESUBMIT.py | 2 + files/infra/config/cr-buildbucket.cfg | 252 +- files/infra/config/main.star | 20 +- files/infra/config/project.cfg | 2 +- files/infra/config/realms.cfg | 4 + files/libyuv.gni | 3 +- files/riscv_script/prepare_toolchain_qemu.sh | 74 + files/riscv_script/riscv-clang.cmake | 52 + files/riscv_script/run_qemu.sh | 15 + files/source/compare.cc | 6 +- files/source/compare_gcc.cc | 2 +- files/source/compare_mmi.cc | 123 - files/source/convert.cc | 893 ++- 
files/source/convert_argb.cc | 1426 ++++- files/source/convert_from.cc | 24 + files/source/convert_from_argb.cc | 1061 +++- files/source/cpu_id.cc | 100 +- files/source/mjpeg_decoder.cc | 4 +- files/source/planar_functions.cc | 659 ++- files/source/rotate.cc | 394 +- files/source/rotate_argb.cc | 16 +- files/source/rotate_common.cc | 127 +- files/source/rotate_gcc.cc | 130 + files/source/rotate_mmi.cc | 291 - files/source/rotate_neon.cc | 40 + files/source/rotate_neon64.cc | 71 +- files/source/row_any.cc | 852 ++- files/source/row_common.cc | 826 ++- files/source/row_gcc.cc | 578 +- files/source/row_lasx.cc | 370 +- files/source/row_lsx.cc | 1580 ++++- files/source/row_mmi.cc | 7842 ------------------------- files/source/row_neon.cc | 268 +- files/source/row_neon64.cc | 255 +- files/source/row_rvv.cc | 956 +++ files/source/row_win.cc | 65 +- files/source/scale.cc | 106 +- files/source/scale_any.cc | 16 + files/source/scale_argb.cc | 98 +- files/source/scale_common.cc | 191 +- files/source/scale_gcc.cc | 5 +- files/source/scale_mmi.cc | 1168 ---- files/source/scale_neon.cc | 39 + files/source/scale_neon64.cc | 39 + files/source/scale_uv.cc | 142 +- files/tools_libyuv/autoroller/roll_deps.py | 582 +- files/unit_test/convert_test.cc | 762 ++- files/unit_test/cpu_test.cc | 146 +- files/unit_test/planar_test.cc | 97 +- files/unit_test/rotate_argb_test.cc | 106 + files/unit_test/rotate_test.cc | 363 ++ files/unit_test/scale_uv_test.cc | 79 +- files/unit_test/testdata/riscv64.txt | 4 + files/unit_test/testdata/riscv64_rvv.txt | 4 + files/unit_test/testdata/riscv64_rvv_zvfh.txt | 4 + files/unit_test/unit_test.cc | 5 + files/unit_test/unit_test.h | 15 +- files/util/cpuid.c | 60 +- files/util/yuvconstants.c | 11 +- files/util/yuvconvert.cc | 10 +- 84 files changed, 13621 insertions(+), 12613 deletions(-) create mode 100644 files/build_overrides/partition_alloc.gni create mode 100755 files/riscv_script/prepare_toolchain_qemu.sh create mode 100644 
files/riscv_script/riscv-clang.cmake create mode 100755 files/riscv_script/run_qemu.sh delete mode 100644 files/source/compare_mmi.cc delete mode 100644 files/source/rotate_mmi.cc delete mode 100644 files/source/row_mmi.cc create mode 100644 files/source/row_rvv.cc delete mode 100644 files/source/scale_mmi.cc create mode 100644 files/unit_test/testdata/riscv64.txt create mode 100644 files/unit_test/testdata/riscv64_rvv.txt create mode 100644 files/unit_test/testdata/riscv64_rvv_zvfh.txt diff --git a/METADATA b/METADATA index bff062d8..5508de20 100644 --- a/METADATA +++ b/METADATA @@ -8,7 +8,7 @@ third_party { type: GIT value: "https://chromium.googlesource.com/libyuv/libyuv/" } - version: "d53f1beecdd8d959f7a3f2e19bd0bd7e7227a233" - last_upgrade_date { year: 2022 month: 8 day: 5 } + version: "2a6cb7431939faba1b40d3f08883847f0cf63572" + last_upgrade_date { year: 2023 month: 6 day: 1 } license_type: NOTICE } diff --git a/README.version b/README.version index 5deb188e..6eb9dc8c 100644 --- a/README.version +++ b/README.version @@ -1,4 +1,4 @@ -Version: r1837 +Version: r1871 BugComponent: 42195 Owner: lajos Local Modifications: diff --git a/files/.gn b/files/.gn index a765caa5..f9a5ee6c 100644 --- a/files/.gn +++ b/files/.gn @@ -34,7 +34,5 @@ exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + default_args = { mac_sdk_min = "10.12" - - # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 - ios_deployment_target = "10.0" + ios_deployment_target = "12.0" } diff --git a/files/.vpython3 b/files/.vpython3 index 0a9aa38b..28d819e7 100644 --- a/files/.vpython3 +++ b/files/.vpython3 @@ -76,8 +76,8 @@ wheel: < version: "version:5.8.0.chromium.2" > wheel: < - name: "infra/python/wheels/requests-py2_py3" - version: "version:2.26.0" + name: "infra/python/wheels/requests-py3" + version: "version:2.31.0" > # Used by various python unit tests. 
diff --git a/files/Android.bp b/files/Android.bp index 0c46f7f1..d02b56f3 100644 --- a/files/Android.bp +++ b/files/Android.bp @@ -62,6 +62,7 @@ cc_library { "source/row_msa.cc", "source/row_neon.cc", "source/row_neon64.cc", + "source/row_rvv.cc", "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", diff --git a/files/BUILD.gn b/files/BUILD.gn index a72ff065..adaae9d8 100644 --- a/files/BUILD.gn +++ b/files/BUILD.gn @@ -6,6 +6,7 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +import("//build/config/features.gni") import("//testing/test.gni") import("libyuv.gni") @@ -21,15 +22,19 @@ declare_args() { config("libyuv_config") { include_dirs = [ "include" ] - if (is_android && current_cpu == "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] - } - if (is_android && current_cpu != "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + if (is_android) { + if (target_cpu == "arm" || target_cpu == "x86" || target_cpu == "mipsel") { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + } else { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] + } } - + defines = [] if (!libyuv_use_neon) { - defines = [ "LIBYUV_DISABLE_NEON" ] + defines += [ "LIBYUV_DISABLE_NEON" ] + } + if (libyuv_disable_rvv) { + defines += [ "LIBYUV_DISABLE_RVV" ] } } @@ -129,6 +134,7 @@ static_library("libyuv_internal") { "source/row_any.cc", "source/row_common.cc", "source/row_gcc.cc", + "source/row_rvv.cc", "source/row_win.cc", "source/scale.cc", "source/scale_any.cc", @@ -150,7 +156,7 @@ static_library("libyuv_internal") { configs += [ "//build/config/gcc:symbol_visibility_default" ] } - if (!is_ios && !libyuv_disable_jpeg) { + if ((!is_ios || use_blink) && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] # Needed to pull in libjpeg headers. 
Can't add //third_party:jpeg to deps diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt index d190507b..7a4a1994 100644 --- a/files/CMakeLists.txt +++ b/files/CMakeLists.txt @@ -4,7 +4,7 @@ PROJECT ( YUV C CXX ) # "C" is required even for C++ projects CMAKE_MINIMUM_REQUIRED( VERSION 2.8.12 ) -OPTION( TEST "Built unit tests" OFF ) +OPTION( UNIT_TEST "Built unit tests" OFF ) SET ( ly_base_dir ${PROJECT_SOURCE_DIR} ) SET ( ly_src_dir ${ly_base_dir}/source ) @@ -41,18 +41,24 @@ endif() ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc ) TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} ) +# this creates the yuvconstants tool +ADD_EXECUTABLE ( yuvconstants ${ly_base_dir}/util/yuvconstants.c ) +TARGET_LINK_LIBRARIES ( yuvconstants ${ly_lib_static} ) -INCLUDE ( FindJPEG ) +find_package ( JPEG ) if (JPEG_FOUND) include_directories( ${JPEG_INCLUDE_DIR} ) - target_link_libraries( yuvconvert ${JPEG_LIBRARY} ) + target_link_libraries( ${ly_lib_shared} ${JPEG_LIBRARY} ) add_definitions( -DHAVE_JPEG ) endif() -if(TEST) +if(UNIT_TEST) find_library(GTEST_LIBRARY gtest) if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") + if (CMAKE_CROSSCOMPILING) + set(GTEST_SRC_DIR third_party/googletest/src/googletest) + endif() if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) @@ -61,7 +67,7 @@ if(TEST) include_directories(${GTEST_SRC_DIR}/include) set(GTEST_LIBRARY gtest) else() - message(FATAL_ERROR "TEST is set but unable to find gtest library") + message(FATAL_ERROR "UNIT_TEST is set but unable to find gtest library") endif() endif() @@ -78,6 +84,12 @@ if(TEST) if(NACL AND NACL_LIBC STREQUAL "newlib") target_link_libraries(libyuv_unittest glibc-compat) endif() + + find_library(GFLAGS_LIBRARY gflags) + if(NOT GFLAGS_LIBRARY STREQUAL "GFLAGS_LIBRARY-NOTFOUND") + 
target_link_libraries(libyuv_unittest gflags) + add_definitions(-DLIBYUV_USE_GFLAGS) + endif() endif() diff --git a/files/DEPS b/files/DEPS index 3cf2dbe0..a7bec8d3 100644 --- a/files/DEPS +++ b/files/DEPS @@ -5,43 +5,62 @@ gclient_gn_args = [ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': '829c6df33dce1085a61d8fd44209fc84bbf9a6a7', - 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94', + 'chromium_revision': 'd1501576384de23ddf8d8815ee7c95be2f708de5', + 'gn_version': 'git_revision:e3978de3e8dafb50a2b11efa784e08699a43faf8', + # ninja CIPD package version. + # https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja + 'ninja_version': 'version:2@1.11.1.chromium.6', + # reclient CIPD package version + 'reclient_version': 're_client_version:0.107.1.0b39c4c-gomaip', # Keep the Chromium default of generating location tags. 'generate_location_tags': True, + + # By default, download the fuchsia sdk from the public sdk directory. + 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/gn/', + 'fuchsia_version': 'version:12.20230530.1.1', + # By default, download the fuchsia images from the fuchsia GCS bucket. + 'fuchsia_images_bucket': 'fuchsia', + 'checkout_fuchsia': False, + # Since the images are hundreds of MB, default to only downloading the image + # most commonly useful for developers. Bots and developers that need to use + # other images can override this with additional images. 
+ 'checkout_fuchsia_boot_images': "terminal.qemu-x64", + 'checkout_fuchsia_product_bundles': '"{checkout_fuchsia_boot_images}" != ""', } deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + 'dcea3443035f48d58193788e0bc56daca4e5db33', + Var('chromium_git') + '/chromium/src/build' + '@' + 'd0c2b4cf4fdd43866e066fb6722099aa8bf4ce79', 'src/buildtools': - Var('chromium_git') + '/chromium/src/buildtools' + '@' + '075dd7e22837a69189003e4fa84499acf63188cf', + Var('chromium_git') + '/chromium/src/buildtools' + '@' + 'edbefcee3d2cc45cdb0c60c2b01b673f8ba728bc', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + 'f4e42be13265ec304b0f3085eee2b15f30f44077', + Var('chromium_git') + '/chromium/src/testing' + '@' + 'a13817e1ea0255a375d13aeb3bb2527bd528495b', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + '42c249feeb71bc0cd184849f0509aefef599343d', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '824e26c9fcbd00fccf6cdb712f8f127aae133042', 'src/buildtools/linux64': { 'packages': [ { - 'package': 'gn/gn/linux-amd64', + 'package': 'gn/gn/linux-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', - 'condition': 'checkout_linux', + 'condition': 'host_os == "linux"', }, + 'src/buildtools/mac': { 'packages': [ { - 'package': 'gn/gn/mac-amd64', + 'package': 'gn/gn/mac-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', - 'condition': 'checkout_mac', + 'condition': 'host_os == "mac"', }, + 'src/buildtools/win': { 'packages': [ { @@ -50,43 +69,57 @@ deps = { } ], 'dep_type': 'cipd', - 'condition': 'checkout_win', + 'condition': 'host_os == "win"', + }, + + 'src/buildtools/reclient': { + 'packages': [ + { + 'package': 'infra/rbe/client/${{platform}}', + 'version': Var('reclient_version'), + } + ], + 'dep_type': 'cipd', }, 'src/buildtools/clang_format/script': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 
'99876cacf78329e5f99c244dbe42ccd1654517a0', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef', 'src/buildtools/third_party/libc++/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '79a2e924d96e2fc1e4b937c42efd08898fa472d7', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'f8279b01085b800724f5c5629dc365b9f040dc53', 'src/buildtools/third_party/libc++abi/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '665b74f7d1b3bb295cd6ba7d8fcec1acd3d2ac84', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '5c8dbff7a4911fe1e0af0bc1628891e4187a3c90', 'src/buildtools/third_party/libunwind/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f51a154281bdfe746c46c07cd4fb05be97f9441d', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'cd144ced35285edaa064a91561969e5b22c219b1', 'src/third_party/catapult': - Var('chromium_git') + '/catapult.git' + '@' + '75423c310eb303d28978be892fcf7b9c2c824909', + Var('chromium_git') + '/catapult.git' + '@' + '9f3ef9c2eae9b1adabde88efe5dcc438ba76e205', 'src/third_party/colorama/src': - Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', + Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49', + 'src/third_party/cpu_features/src': { + 'url': Var('chromium_git') + '/external/github.com/google/cpu_features.git' + '@' + '936b9ab5515dead115606559502e3864958f7f6e', + 'condition': 'checkout_android', + }, 'src/third_party/depot_tools': - Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '2ffa1bde797a8127c0f72908d0bd74051fd65d0d', + Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 
'05ab73be51774f098eb580eda6e96a49e1010b1b', 'src/third_party/freetype/src': - Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'cff026d41599945498044d2f4dcc0e610ffb6929', + Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '80a507a6b8e3d2906ad2c8ba69329bd2fb2a85ef', 'src/third_party/googletest/src': - Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'e2f3978937c0244508135f126e2617a7734a68be', + Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07', 'src/third_party/harfbuzz-ng/src': - Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '64b29dbd5994a511acee69cb9b45ad650ef88359', + Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '8df5cdbcda495a582e72a7e2ce35d6106401edce', 'src/third_party/libjpeg_turbo': - Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '02959c3ee17abacfd1339ec22ea93301292ffd56', + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'aa4075f116e4312537d0d3e9dbd5e31096539f94', 'src/third_party/nasm': - Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '9215e8e1d0fe474ffd3e16c1a07a0f97089e6224', + Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '7fc833e889d1afda72c06220e5bed8fb43b2e5ce', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '198dc879529652b39ba6e223bcc0bcad5f1facd6', + Var('chromium_git') + '/chromium/src/tools' + '@' + '916dfffd61cbf61075c47d7b480425d7de1483fd', # libyuv-only dependencies (not present in Chromium). 
'src/third_party/gtest-parallel': Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e', 'src/third_party/lss': { - 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '92a65a8f5d705d1928874420c8d0d15bde8c89e5', + 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + 'ce877209e11aa69dcfffbd53ef90ea1d07136521', 'condition': 'checkout_android or checkout_linux', }, @@ -101,14 +134,32 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/auto/src': { - 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'fe67d853d6356943dc79541c892ab6d3e6a7b61a', - 'condition': 'checkout_android', + + 'src/third_party/kotlin_stdlib': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlin_stdlib', + 'version': 'z4_AYYz2Tw5GKikuiDLTuxxf0NJVGLkC3CVcyiIpc-gC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/kotlinc/current': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlinc', + 'version': 'J3BAlA7yf4corBopDhlwuT9W4jR1Z9R55KD3BUTVldQC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', }, + 'src/third_party/boringssl/src': - 'https://boringssl.googlesource.com/boringssl.git' + '@' + '3a667d10e94186fd503966f5638e134fe9fb4080', + 'https://boringssl.googlesource.com/boringssl.git' + '@' + 'dd5219451c3ce26221762a15d867edf43b463bb2', 'src/base': { - 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e9e639622449a893a1b5e32781d072cec08ead72', + 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'b4c5ce6cb1a7c90de3fdddc80ed439fe87eab443', 'condition': 'checkout_android', }, 'src/third_party/bazel': { @@ -132,19 +183,21 @@ deps = { 'dep_type': 'cipd', }, 'src/third_party/android_ndk': { - 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '401019bf85744311b26c88ced255cd53401af8b7', + 'url': Var('chromium_git') + '/android_ndk.git' + '@' + 
'310956bd122ec2b96049f8d7398de6b717f3452e', 'condition': 'checkout_android', }, + 'src/third_party/androidx': { 'packages': [ { 'package': 'chromium/third_party/androidx', - 'version': '6d8ij5pzYh29WWjPbdbAWFBJSA1nUgkWf2p6wCVZKIsC', + 'version': 'Wr5b9WJiFAzJcmjmvQIePIxk5IgpDl62kaGY_SiLxJEC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_support_test_runner': { 'packages': [ { @@ -158,16 +211,12 @@ deps = { 'src/third_party/android_sdk/public': { 'packages': [ { - 'package': 'chromium/third_party/android_sdk/public/build-tools/31.0.0', - 'version': 'tRoD45SCi7UleQqSV7MrMQO1_e5P8ysphkCcj6z_cCQC', + 'package': 'chromium/third_party/android_sdk/public/build-tools/33.0.0', + 'version': '-VRKr36Uw8L_iFqqo9nevIBgNMggND5iWxjidyjnCgsC', }, { 'package': 'chromium/third_party/android_sdk/public/emulator', - 'version': 'gMHhUuoQRKfxr-MBn3fNNXZtkAVXtOwMwT7kfx8jkIgC', - }, - { - 'package': 'chromium/third_party/android_sdk/public/extras', - 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC', + 'version': '9lGp8nTUCRRWGMnI_96HcKfzjnxEJKUcfvfwmA3wXNkC', }, { 'package': 'chromium/third_party/android_sdk/public/patcher', @@ -175,11 +224,15 @@ deps = { }, { 'package': 'chromium/third_party/android_sdk/public/platform-tools', - 'version': 'g7n_-r6yJd_SGRklujGB1wEt8iyr77FZTUJVS9w6O34C', + 'version': 'RSI3iwryh7URLGRgJHsCvUxj092woTPnKt4pwFcJ6L8C', }, { - 'package': 'chromium/third_party/android_sdk/public/platforms/android-31', - 'version': 'lL3IGexKjYlwjO_1Ga-xwxgwbE_w-lmi2Zi1uOlWUIAC', + 'package': 'chromium/third_party/android_sdk/public/platforms/android-33', + 'version': 'eo5KvW6UVor92LwZai8Zulc624BQZoCu-yn7wa1z_YcC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox', + 'version': 'YWMYkzyxGBgVsty0GhXL1oxbY0pGXQIgFc0Rh7ZMRPYC', }, { 'package': 'chromium/third_party/android_sdk/public/sources/android-31', @@ -187,7 +240,7 @@ deps = { }, { 'package': 
'chromium/third_party/android_sdk/public/cmdline-tools', - 'version': 'Ez2NWws2SJYCF6qw2O-mSCqK6424l3ZdSTpppLyVR_cC', + 'version': 'EWnL2r7oV5GtE9Ef7GyohyFam42wtMtEKYU4dCb3U1YC', }, ], 'condition': 'checkout_android', @@ -207,7 +260,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_build_tools/aapt2', - 'version': 'version:3.6.0-alpha03-5516695-cr0', + 'version': 'STY0BXlZxsEhudnlXQFed-B5UpwehcoM0sYqor6qRqsC', }, ], 'condition': 'checkout_android', @@ -223,6 +276,16 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/byte_buddy/android_sdk_build_tools_25_0_2': { + 'packages': [ + { + 'package': 'chromium/third_party/android_sdk/public/build-tools', + 'version': 'kwIs2vdfTm93yEP8LG5aSnchN4BVEdVxbqQtF4XpPdkC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, 'src/third_party/ced/src': { 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5', 'condition': 'checkout_android', @@ -267,7 +330,7 @@ deps = { }, 'src/third_party/icu': { - 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'bf66d373ae781a3498f2babe7b61d933dd774b82', + 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'a2961dc659b4ae847a9c6120718cc2517ee57d9e', }, 'src/third_party/icu4j': { 'packages': [ @@ -293,11 +356,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/jdk', - 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC', - }, - { - 'package': 'chromium/third_party/jdk/extras', - 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C', + 'version': '2Of9Pe_OdO4xoAATuiLDiMVNebKTNO3WrwJGqil4RosC', }, ], 'condition': 'checkout_android', @@ -308,22 +367,31 @@ deps = { 'condition': 'checkout_android', }, 'src/third_party/junit/src': { - 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481', + 'url': Var('chromium_git') + '/external/junit.git' + '@' + 
'05fe2a64f59127c02135be22f416e91260d6ede6', 'condition': 'checkout_android', }, 'src/third_party/libunwindstack': { - 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '6868358481bb1e5e20d155c1084dc436c88b5e6b', + 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '4dbfa0e8c844c8e243b297bc185e54a99ff94f9e', 'condition': 'checkout_android', }, + 'src/third_party/ninja': { + 'packages': [ + { + 'package': 'infra/3pp/tools/ninja/${{platform}}', + 'version': Var('ninja_version'), + } + ], + 'dep_type': 'cipd', + }, 'src/third_party/mockito/src': { - 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac', + 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '7c3641bcef717ffa7d765f2c86b847d0aab1aac9', 'condition': 'checkout_android', }, 'src/third_party/objenesis': { 'packages': [ { 'package': 'chromium/third_party/objenesis', - 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0', + 'version': 'tknDblENYi8IaJYyD6tUahUyHYZlzJ_Y74_QZSz4DpIC', }, ], 'condition': 'checkout_android', @@ -343,7 +411,20 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': 'Nu_mvQJe34CotIXadFlA3w732CJ9EvQGuVs4udcZedAC', + 'version': '4Oq32DG2vuDh7Frxj6tH5xyi77sVgBWpvvl4hwvZRR4C', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + # This duplication is intentional, so we avoid updating the r8.jar used by + # dexing unless necessary, since each update invalidates all incremental + # dexing and unnecessarily slows down all bots. 
+ 'src/third_party/r8/d8': { + 'packages': [ + { + 'package': 'chromium/third_party/r8', + 'version': 'PwglNZFRNPkBBXdnY9NfrZFk2ULWDTRxhV9rl2kvkpUC', }, ], 'condition': 'checkout_android', @@ -367,7 +448,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/robolectric', - 'version': 'iC6RDM5EH3GEAzR-1shW_Mg0FeeNE5shq1okkFfuuNQC', + 'version': 'hzetqh1qFI32FOgQroZvGcGdomrgVBJ6WKRnl1KFw6EC', }, ], 'condition': 'checkout_android', @@ -377,7 +458,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/sqlite4java', - 'version': '889660698187baa7c8b0d79f7bf58563125fbd66', + 'version': 'LofjKH9dgXIAJhRYCPQlMFywSwxYimrfDeBmaHc-Z5EC', }, ], 'condition': 'checkout_android', @@ -387,7 +468,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/turbine', - 'version': 'Om6yIEXgJxuqghErK29h9RcMH6VaymMbxwScwXmcN6EC', + 'version': 'Foa7uRpVoKr4YoayCKc9EERkjpmGOE3DAUTWFLL7gKEC', }, ], 'condition': 'checkout_android', @@ -400,1718 +481,1822 @@ deps = { # iOS deps: 'src/ios': { - 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '81826d980c159f949c2c7901f4dbec9a09788964', + 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '241921896b64f85de9a32d461462913cbff4baeb', 'condition': 'checkout_ios' }, # Everything coming after this is automatically updated by the auto-roller. 
# === ANDROID_DEPS Generated Code Start === - + # Generated by //third_party/android_deps/fetch_all.py 'src/third_party/android_deps/libs/android_arch_core_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_core_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel', - 'version': 'version:2@1.1.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent', - 'version': 'version:2@3.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/classworlds_classworlds': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds', - 'version': 'version:2@1.1-alpha-2.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cardview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_collections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cursoradapter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_customview': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_design': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_design', - 'version': 'version:2@28.0.0.cr0', + 'version': 
'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_documentfile': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_drawerlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_interpolator': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_loader': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_multidex': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex', - 'version': 'version:2@1.0.0.cr0', + 'version': 'version:2@1.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_android_support_print': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_print', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_ui': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_utils': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_fragment': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_media_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_v4': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_transition': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_transition', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_viewpager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_tools_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': { + + 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs', - 'version': 'version:2@1.1.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': { + + 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration', - 'version': 'version:2@1.1.1.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { + + 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', + 'version': 'version:2@2.8.8.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { + + 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { + + 'src/third_party/android_deps/libs/com_google_android_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', - 'version': 'version:2@2.8.8.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_annotations', + 'version': 'version:2@4.1.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { + + 'src/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework': { 'packages': [ { - 'package': 
'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', - 'version': 'version:2@1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework', + 'version': 'version:2@4.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api', - 'version': 'version:2@2.2.1.cr0', + 'version': 'version:2@2.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@20.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging', - 'version': 'version:2@16.0.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido', - 'version': 'version:2@19.0.0-beta.cr0', + 'version': 
'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@19.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype', - 'version': 'version:2@17.0.0.cr0', + 
'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks', - 'version': 'version:2@17.2.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@20.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@19.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_material_material': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material', - 'version': 
'version:2@1.6.0-alpha01.cr0', + 'version': 'version:2@1.7.0-alpha02.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_play_core_common': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core_common', + 'version': 'version:2@2.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_google_android_play_core': { + + 'src/third_party/android_deps/libs/com_google_android_play_feature_delivery': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core', - 'version': 'version:2@1.10.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_feature_delivery', + 'version': 'version:2@2.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_auto_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common', - 'version': 'version:2@1.1.2.cr0', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations', - 'version': 'version:2@1.7.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_code_findbugs_jformatstring': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring', - 'version': 'version:2@3.0.0.cr0', + 'version': 'version:2@1.10.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305', - 'version': 'version:2@3.0.2.cr0', + 'version': 'version:2@3.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_gson_gson': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson', - 'version': 'version:2@2.8.0.cr0', + 'version': 'version:2@2.9.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.18.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac', - 'version': 'version:2@9+181-r4173-1.cr0', + 'version': 'version:2@9+181-r4173-1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded', - 'version': 'version:2@9-dev-r4023-3.cr0', + 'version': 'version:2@9-dev-r4023-3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations', - 'version': 'version:2@16.0.0.cr0', + 'version': 'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common', - 'version': 'version:2@19.5.0.cr0', + 'version': 'version:2@19.5.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_components': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json', - 'version': 'version:2@17.1.0.cr0', + 'version': 'version:2@17.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid', - 'version': 'version:2@21.0.1.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations', - 'version': 'version:2@16.3.5.cr0', + 'version': 'version:2@16.3.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop', - 'version': 'version:2@16.0.1.cr0', + 'version': 'version:2@16.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@18.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging', - 'version': 'version:2@21.0.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java', - 'version': 'version:2@2.0.3.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format', - 'version': 'version:2@1.5.cr0', + 'version': 'version:2@1.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_failureaccess': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess', - 'version': 'version:2@1.0.1.cr0', + 'version': 'version:2@1.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', - 'version': 'version:2@31.0-jre.cr0', + 'version': 'version:2@31.1-jre.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava_android': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android', - 'version': 'version:2@31.0-android.cr0', + 'version': 'version:2@31.1-android.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture', - 'version': 'version:2@1.0.cr0', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations', - 'version': 'version:2@1.3.cr0', + 'version': 'version:2@1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java', - 'version': 'version:2@3.4.0.cr0', + 'version': 'version:2@3.19.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite', - 'version': 'version:2@3.13.0.cr0', + 'version': 'version:2@3.21.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils', - 'version': 'version:2@1.3.0.cr0', + 'version': 'version:2@1.3.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javapoet': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet', - 'version': 'version:2@1.13.0.cr0', + 'version': 'version:2@1.13.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javawriter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter', - 'version': 'version:2@2.1.1.cr0', - }, - ], - 'condition': 
'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', - 'version': 'version:2@4.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', - 'version': 'version:2@1.3.2.cr0', + 'version': 'version:2@2.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', - 'version': 'version:2@1.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_inject_javax_inject': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', - 'version': 'version:2@1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/nekohtml_nekohtml': { + + 'src/third_party/android_deps/libs/com_squareup_okio_okio_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm', + 'version': 'version:2@3.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/nekohtml_xercesminimal': { + + 'src/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm', + 'version': 'version:2@4.5.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { + + 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', - 'version': 'version:2@0.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', + 'version': 'version:2@4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2', - 'version': 'version:2@2.3.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_api', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_binder': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_binder', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_context': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_context', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 
'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks', - 'version': 'version:2@2.1.3.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_core', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_stub': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_stub', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': { + + 'src/third_party/android_deps/libs/io_perfmark_perfmark_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_perfmark_perfmark_api', + 'version': 'version:2@0.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_model': { + + 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { 
'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', + 'version': 'version:2@1.3.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': { + + 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': { + + 'src/third_party/android_deps/libs/javax_inject_javax_inject': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', + 'version': 'version:2@1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_project': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy', + 'version': 'version:2@1.14.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata', - 'version': 
'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent', + 'version': 'version:2@1.14.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': { + + 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', + 'version': 'version:2@0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': { + + 'src/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on', + 'version': 'version:2@1.72.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': { + + 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', + 'version': 'version:2@2.5.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_util': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', - 'version': 'version:2@1.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_util', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { + + 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', - 'version': 'version:2@2.5.5.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', + 'version': 'version:2@3.15.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { + + 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', - 'version': 'version:2@3.12.0.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', + 'version': 'version:2@1.21.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { + + 'src/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', - 'version': 'version:2@3.15.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber', + 'version': 'version:2@2.5.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { + + 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', - 'version': 'version:2@1.17.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', + 'version': 'version:2@4.4.1.201607150455-r.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': { + + 'src/third_party/android_deps/libs/org_hamcrest_hamcrest': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default', - 'version': 'version:2@1.0-alpha-9-stable-1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_hamcrest_hamcrest', + 'version': 'version:2@2.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation', - 'version': 
'version:2@1.11.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils', - 'version': 'version:2@1.5.15.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', - 'version': 'version:2@4.4.1.201607150455-r.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_annotations': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations', - 'version': 'version:2@13.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib', 
- 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common', - 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', + 'version': 'version:2@0.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { + + 'src/third_party/android_deps/libs/org_jsoup_jsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jsoup_jsoup', + 'version': 'version:2@1.15.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', - 'version': 
'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_subclass': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { + + 'src/third_party/android_deps/libs/org_objenesis_objenesis': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', - 'version': 'version:2@0.1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_objenesis_objenesis', + 'version': 'version:2@3.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_pcollections_pcollections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections', - 'version': 'version:2@2.1.2.cr0', + 'version': 'version:2@3.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_junit': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat', + 
'version': 'version:2@1.0.1.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/org_robolectric_pluginapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_resources': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_robolectric': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_sandbox': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadowapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/org_robolectric_shadows_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', @@ -2197,29 +2382,74 @@ hooks = [ 'condition': 'checkout_mac', }, { - 'name': 'msan_chained_origins', + 'name': 'msan_chained_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_no_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 
'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_chained_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-chained-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', ], }, { - 'name': 'msan_no_origins', + 'name': 'msan_no_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-no-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', ], }, + { + 'name': 'Download Fuchsia SDK from GCS', + 'pattern': '.', + 'condition': 'checkout_fuchsia', + 'action': [ + 'python3', + 'src/build/fuchsia/update_sdk.py', + '--cipd-prefix={fuchsia_sdk_cipd_prefix}', + '--version={fuchsia_version}', + ], + }, + { + 'name': 'Download Fuchsia system images', + 'pattern': '.', + 'condition': 'checkout_fuchsia and checkout_fuchsia_product_bundles', + 'action': [ + 'python3', + 'src/build/fuchsia/update_product_bundles.py', + '{checkout_fuchsia_boot_images}', + ], + }, { # Pull clang if needed or requested via GYP_DEFINES. # Note: On Win, this should run after win_toolchain, as it may use it. 
@@ -2238,7 +2468,9 @@ hooks = [ { 'name': 'clang_format_win', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "win"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=win32', '--no_auth', @@ -2247,21 +2479,38 @@ hooks = [ ], }, { - 'name': 'clang_format_mac', + 'name': 'clang_format_mac_x64', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "mac" and host_cpu == "x64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=darwin', '--no_auth', '--bucket', 'chromium-clang-format', - '-s', 'src/buildtools/mac/clang-format.sha1', + '-s', 'src/buildtools/mac/clang-format.x64.sha1', + '-o', 'src/buildtools/mac/clang-format', ], }, + { + 'name': 'clang_format_mac_arm64', + 'pattern': '.', + 'condition': 'host_os == "mac" and host_cpu == "arm64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', 'src/buildtools/mac/clang-format.arm64.sha1', + '-o', 'src/buildtools/mac/clang-format', + ], + }, { 'name': 'clang_format_linux', 'pattern': '.', 'condition': 'host_os == "linux"', - 'action': [ 'download_from_google_storage', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=linux*', '--no_auth', @@ -2303,18 +2552,6 @@ hooks = [ '-d', 'src/tools/luci-go/linux64', ], }, - { - # We used to use src as a CIPD root. We moved it to a different directory - # in crrev.com/c/930178 but left the clobber here to ensure that that CL - # could be reverted safely. This can be safely removed once crbug.com/794764 - # is resolved. 
- 'name': 'Android Clobber Deprecated CIPD Root', - 'pattern': '.', - 'condition': 'checkout_android', - 'action': ['src/build/cipd/clobber_cipd_root.py', - '--root', 'src', - ], - }, { 'name': 'Generate component metadata for tests', 'pattern': '.', diff --git a/files/README.chromium b/files/README.chromium index 3f68e21e..880191e4 100644 --- a/files/README.chromium +++ b/files/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1837 +Version: 1871 License: BSD License File: LICENSE diff --git a/files/README.md b/files/README.md index db70b7f0..95eeb04c 100644 --- a/files/README.md +++ b/files/README.md @@ -7,6 +7,7 @@ * Optimized for SSSE3/AVX2 on x86/x64. * Optimized for Neon on Arm. * Optimized for MSA on Mips. +* Optimized for RVV on RISC-V. ### Development diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni index c8490313..d9d01d51 100644 --- a/files/build_overrides/build.gni +++ b/files/build_overrides/build.gni @@ -13,6 +13,9 @@ build_with_chromium = false # Some non-Chromium builds don't support building java targets. enable_java_templates = true +# Enables assertions on safety checks in libc++. +enable_safe_libcxx = true + # Allow using custom suppressions files (currently not used by libyuv). asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" diff --git a/files/build_overrides/partition_alloc.gni b/files/build_overrides/partition_alloc.gni new file mode 100644 index 00000000..dcf8ac2d --- /dev/null +++ b/files/build_overrides/partition_alloc.gni @@ -0,0 +1,17 @@ +# Copyright 2022 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. 
All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +# Use default values for PartitionAlloc as standalone library from +# base/allocator/partition_allocator/build_overrides/partition_alloc.gni +use_partition_alloc_as_malloc_default = false +use_allocator_shim_default = false +enable_backup_ref_ptr_support_default = false +enable_mte_checked_ptr_support_default = false +put_ref_count_in_previous_slot_default = false +enable_backup_ref_ptr_slow_checks_default = false +enable_dangling_raw_ptr_checks_default = false diff --git a/files/docs/environment_variables.md b/files/docs/environment_variables.md index dd5d59fb..4eb09659 100644 --- a/files/docs/environment_variables.md +++ b/files/docs/environment_variables.md @@ -40,6 +40,9 @@ By default the cpu is detected and the most advanced form of SIMD is used. But LIBYUV_DISABLE_LSX LIBYUV_DISABLE_LASX +## RISCV CPUs + LIBYUV_DISABLE_RVV + # Test Width/Height/Repeat The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions. diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md index 15b19ab2..b19f0009 100644 --- a/files/docs/getting_started.md +++ b/files/docs/getting_started.md @@ -220,6 +220,35 @@ Install cmake: http://www.cmake.org/ make -j4 make package +## Building RISC-V target with cmake + +### Prerequisite: build risc-v clang toolchain and qemu + +If you don't have prebuilt clang and riscv64 qemu, run the script to download source and build them. + + ./riscv_script/prepare_toolchain_qemu.sh + +After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`. + +### Cross-compile for RISC-V target + cmake -B out/Release/ -DUNIT_TEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \ + -DTOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + -DUSE_RVV=ON . 
+ cmake --build out/Release/ + + +### Run on QEMU + +#### Run libyuv_unittest on QEMU + cd out/Release/ + USE_RVV=ON \ + TOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + QEMU_PREFIX_PATH={QEMU_PREFIX_PATH} \ + ../../riscv_script/run_qemu.sh libyuv_unittest + + ## Setup for Arm Cross compile See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h index 46d37159..88619a4f 100644 --- a/files/include/libyuv/convert.h +++ b/files/include/libyuv/convert.h @@ -151,6 +151,33 @@ int MM21ToI420(const uint8_t* src_y, int width, int height); +// Convert MM21 to YUY2 +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert MT2T to P010 +// Note that src_y and src_uv point to packed 10-bit values, so the Y plane will +// be 10 / 8 times the dimensions of the image. Also for this reason, +// src_stride_y and src_stride_uv are given in bytes. +LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I422 to NV21. 
LIBYUV_API int I422ToNV21(const uint8_t* src_y, @@ -272,6 +299,23 @@ int I210ToI422(const uint16_t* src_y, int width, int height); +#define H410ToH420 I410ToI420 +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H410ToH444 I410ToI444 LIBYUV_API int I410ToI444(const uint16_t* src_y, @@ -323,6 +367,23 @@ int I212ToI422(const uint16_t* src_y, int width, int height); +#define H212ToH420 I212ToI420 +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H412ToH444 I412ToI444 LIBYUV_API int I412ToI444(const uint16_t* src_y, @@ -340,6 +401,23 @@ int I412ToI444(const uint16_t* src_y, int width, int height); +#define H412ToH420 I412ToI420 +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define I412ToI012 I410ToI010 #define H410ToH010 I410ToI010 #define H412ToH012 I410ToI010 @@ -560,6 +638,36 @@ int NV16ToNV24(const uint8_t* src_y, int width, int height); +// Convert P010 to I010. +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert P012 to I012. 
+LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert P010 to P410. LIBYUV_API int P010ToP410(const uint16_t* src_y, @@ -677,6 +785,21 @@ int ARGBToI420(const uint8_t* src_argb, int width, int height); +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + // BGRA little endian (argb in memory) to I420. LIBYUV_API int BGRAToI420(const uint8_t* src_bgra, diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h index f66d20ce..35eeac9b 100644 --- a/files/include/libyuv/convert_argb.h +++ b/files/include/libyuv/convert_argb.h @@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ @@ -404,6 +406,32 @@ int U444ToABGR(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24. +LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I444 to RAW. 
+LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + // Convert I010 to ARGB. LIBYUV_API int I010ToARGB(const uint16_t* src_y, @@ -1312,6 +1340,32 @@ int J420ToRAW(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + LIBYUV_API int I420ToRGB565(const uint8_t* src_y, int src_stride_y, @@ -1495,6 +1549,20 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, @@ -1893,6 +1961,20 @@ int I420ToRGB24Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24 with matrix. +LIBYUV_API +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to RGB565 with specified color matrix. 
LIBYUV_API int I420ToRGB565Matrix(const uint8_t* src_y, @@ -1907,6 +1989,20 @@ int I420ToRGB565Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB565 with specified color matrix. +LIBYUV_API +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to AR30 with matrix. LIBYUV_API int I420ToAR30Matrix(const uint8_t* src_y, @@ -1961,6 +2057,36 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, int height, enum FilterMode filter); +// Convert I422 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + +// Convert I420 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + // Convert I010 to AR30 with matrix and UV filter mode. 
LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h index 2a488838..ff2a581a 100644 --- a/files/include/libyuv/convert_from_argb.h +++ b/files/include/libyuv/convert_from_argb.h @@ -209,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -222,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -238,6 +238,41 @@ int ARGBToJ400(const uint8_t* src_argb, int width, int height); +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J422. +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J400. (JPeg full range). +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + // Convert RGBA to J400. (JPeg full range). 
LIBYUV_API int RGBAToJ400(const uint8_t* src_rgba, diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h index fb90c6c7..203f7e0d 100644 --- a/files/include/libyuv/cpu_id.h +++ b/files/include/libyuv/cpu_id.h @@ -55,6 +55,11 @@ static const int kCpuHasLOONGARCH = 0x2000000; static const int kCpuHasLSX = 0x4000000; static const int kCpuHasLASX = 0x8000000; +// These flags are only valid on RISCV processors. +static const int kCpuHasRISCV = 0x10000000; +static const int kCpuHasRVV = 0x20000000; +static const int kCpuHasRVVZVFH = 0x40000000; + // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API @@ -78,6 +83,8 @@ LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); LIBYUV_API int MipsCpuCaps(const char* cpuinfo_name); +LIBYUV_API +int RiscvCpuCaps(const char* cpuinfo_name); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h index 1ef2256b..154f2f21 100644 --- a/files/include/libyuv/planar_functions.h +++ b/files/include/libyuv/planar_functions.h @@ -85,13 +85,23 @@ void SetPlane(uint8_t* dst_y, // Convert a plane of tiles of 16 x H to linear. LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height); +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); + +// Convert a plane of 16 bit tiles of 16 x H to linear. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); // Convert a UV plane of tiles of 16 x H into linear U and V planes. 
LIBYUV_API @@ -105,6 +115,18 @@ void DetileSplitUVPlane(const uint8_t* src_uv, int height, int tile_height); +// Convert a Y and UV plane of tiles into interlaced YUY2. +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, @@ -370,7 +392,26 @@ int I210Copy(const uint16_t* src_y, int width, int height); +// Copy I410 to I410. +#define I410ToI410 I410Copy +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -383,6 +424,7 @@ int NV12Copy(const uint8_t* src_y, int height); // Copy NV21. Supports inverting. +LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h index 684ed5e6..37460c4a 100644 --- a/files/include/libyuv/rotate.h +++ b/files/include/libyuv/rotate.h @@ -85,6 +85,60 @@ int I444Rotate(const uint8_t* src_y, int height, enum RotationMode mode); +// Rotate I010 frame. +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I210 frame. 
+LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I410 frame. +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + // Rotate NV12 input and store in I420. LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, @@ -156,6 +210,16 @@ void RotatePlane270(const uint8_t* src, int width, int height); +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); + // Rotations for when U and V are interleaved. 
// These functions take one UV input pointer and // split the data into two buffers while diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h index aa8528a9..2dd8c03d 100644 --- a/files/include/libyuv/rotate_row.h +++ b/files/include/libyuv/rotate_row.h @@ -42,6 +42,8 @@ extern "C" { // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSE4X4_32_SSE2 +#define HAS_TRANSPOSE4X4_32_AVX2 #endif // The following are available for 64 bit GCC: @@ -54,6 +56,7 @@ extern "C" { (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON +#define HAS_TRANSPOSE4X4_32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -215,6 +218,48 @@ void TransposeUVWx16_Any_LSX(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); +void TransposeWx1_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); #ifdef __cplusplus } // extern "C" diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h index 1a1cf4b6..5b244d77 100644 --- a/files/include/libyuv/row.h +++ 
b/files/include/libyuv/row.h @@ -11,7 +11,8 @@ #ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ -#include // For malloc. +#include // For NULL +#include // For malloc #include "libyuv/basic_types.h" @@ -75,9 +76,6 @@ extern "C" { (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions: #define HAS_ABGRTOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_SSSE3 -#endif #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -92,12 +90,6 @@ extern "C" { #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#endif #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 @@ -111,6 +103,7 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_I444TORGB24ROW_SSSE3 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_J400TOARGBROW_SSE2 #define HAS_J422TOARGBROW_SSSE3 @@ -124,16 +117,13 @@ extern "C" { #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 +#define HAS_RAWTOYJROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 -#endif #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -145,13 +135,18 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 
+#define HAS_BGRATOUVROW_SSSE3 +#define HAS_RGBATOUVROW_SSSE3 +#endif // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_SSSE3 -#endif #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -176,6 +171,9 @@ extern "C" { #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBATTENUATEROW_SSSE3 +#endif // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. @@ -201,17 +199,10 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#endif #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -219,6 +210,7 @@ extern "C" { #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 +#define HAS_I444TORGB24ROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 @@ -228,6 +220,8 @@ extern "C" { #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -237,16 +231,21 @@ extern "C" { #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test half float cast +#if !defined(LIBYUV_BIT_EXACT) +#define 
HAS_ARGBTOUVJROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#endif // Effects: #define HAS_ARGBADDROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_AVX2 -#endif #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBATTENUATEROW_AVX2 +#endif #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) @@ -282,28 +281,32 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ABGRTOYJROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_DETILEROW_SSE2 +#define HAS_DETILEROW_16_SSE2 +#define HAS_DETILEROW_16_AVX #define HAS_DETILESPLITUVROW_SSSE3 +#define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 #define HAS_I212TOAR30ROW_SSSE3 #define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 @@ -312,15 +315,17 @@ extern "C" { #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_SSSE3 -#endif #define HAS_SPLITARGBROW_SSE2 
#define HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITXRGBROW_SSE2 #define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 +#define HAS_YUY2TONVUVROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_SSSE3 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -335,31 +340,20 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_AB64TOARGBROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_AVX2 +#define HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 -#endif +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_INTERPOLATEROW_16TO8_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -367,23 +361,35 @@ extern "C" { #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 -#define HAS_P210TOAR30ROW_AVX2 -#define HAS_P210TOARGBROW_AVX2 -#define HAS_P410TOAR30ROW_AVX2 -#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 +#define HAS_INTERPOLATEROW_16TO8_AVX2 +#define 
HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 +#define HAS_MERGEARGBROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_MIRRORUVROW_AVX2 #define HAS_MULTIPLYROW_16_AVX2 -#if !defined(LIBYUV_BIT_EXACT) +#define HAS_NV21TOYUV24ROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_RGBATOYJROW_AVX2 -#endif #define HAS_SPLITARGBROW_AVX2 -#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SWAPUVROW_AVX2 +#define HAS_YUY2TONVUVROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_AVX2 +#define HAS_ABGRTOUVROW_AVX2 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -397,8 +403,9 @@ extern "C" { // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. 
Issue libyuv:789 #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) + (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) #define HAS_ARGBTORGB24ROW_AVX512VBMI +#define HAS_MERGEUVROW_AVX512BW #endif // The following are available for AVX512 clang x64 platforms: @@ -412,7 +419,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_AB64TOARGBROW_NEON +#define HAS_ABGRTOUVJROW_NEON #define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYJROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_AR64TOARGBROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -444,8 +453,11 @@ extern "C" { #define HAS_BYTETOFLOATROW_NEON #define HAS_CONVERT16TO8ROW_NEON #define HAS_COPYROW_NEON +#define HAS_DETILEROW_16_NEON #define HAS_DETILEROW_NEON #define HAS_DETILESPLITUVROW_NEON +#define HAS_DETILETOYUY2_NEON +#define HAS_UNPACKMT2T_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON @@ -461,6 +473,7 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444ALPHATOARGBROW_NEON #define HAS_I444TOARGBROW_NEON +#define HAS_I444TORGB24ROW_NEON #define HAS_INTERPOLATEROW_16_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_J400TOARGBROW_NEON @@ -513,6 +526,7 @@ extern "C" { #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TONVUVROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON @@ -524,13 +538,13 @@ extern "C" { #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_RGB24MIRRORROW_NEON #define HAS_SOBELROW_NEON #define 
HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXROW_NEON @@ -540,12 +554,13 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_GAUSSCOL_F32_NEON +#define HAS_GAUSSROW_F32_NEON #define HAS_INTERPOLATEROW_16TO8_NEON #define HAS_SCALESUMSAMPLES_NEON -#define HAS_GAUSSROW_F32_NEON -#define HAS_GAUSSCOL_F32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVJROW_MSA #define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOYROW_MSA #define HAS_ARGB1555TOARGBROW_MSA @@ -581,27 +596,25 @@ extern "C" { #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_I422TOARGBROW_MSA -#define HAS_I422TORGBAROW_MSA #define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGB1555ROW_MSA +#define HAS_I422TOARGB4444ROW_MSA +#define HAS_I422TOARGBROW_MSA #define HAS_I422TORGB24ROW_MSA #define HAS_I422TORGB565ROW_MSA -#define HAS_I422TOARGB4444ROW_MSA -#define HAS_I422TOARGB1555ROW_MSA -#define HAS_NV12TOARGBROW_MSA -#define HAS_NV12TORGB565ROW_MSA -#define HAS_NV21TOARGBROW_MSA -#define HAS_YUY2TOARGBROW_MSA -#define HAS_UYVYTOARGBROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA #define HAS_I444TOARGBROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA #define HAS_MIRRORROW_MSA -#define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA @@ -621,113 +634,208 @@ extern "C" { #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA #define 
HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA #endif #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ABGRTOUVROW_LSX +#define HAS_ABGRTOYROW_LSX #define HAS_ARGB1555TOARGBROW_LSX -#define HAS_RGB565TOARGBROW_LSX -#define HAS_RGB24TOARGBROW_LSX -#define HAS_RAWTOARGBROW_LSX -#define HAS_ARGB1555TOYROW_LSX #define HAS_ARGB1555TOUVROW_LSX -#define HAS_RGB565TOYROW_LSX -#define HAS_RGB565TOUVROW_LSX -#define HAS_RGB24TOYROW_LSX -#define HAS_RGB24TOUVROW_LSX -#define HAS_RAWTOYROW_LSX -#define HAS_RAWTOUVROW_LSX +#define HAS_ARGB1555TOYROW_LSX +#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ARGBADDROW_LSX +#define HAS_ARGBATTENUATEROW_LSX +#define HAS_ARGBBLENDROW_LSX +#define HAS_ARGBCOLORMATRIXROW_LSX +#define HAS_ARGBEXTRACTALPHAROW_LSX +#define HAS_ARGBGRAYROW_LSX +#define HAS_ARGBSEPIAROW_LSX +#define HAS_ARGBSHADEROW_LSX +#define HAS_ARGBSHUFFLEROW_LSX +#define HAS_ARGBSUBTRACTROW_LSX +#define HAS_ARGBQUANTIZEROW_LSX +#define HAS_ARGBSETROW_LSX +#define HAS_ARGBTOARGB1555ROW_LSX +#define HAS_ARGBTOARGB4444ROW_LSX +#define HAS_ARGBTORAWROW_LSX +#define HAS_ARGBTORGB24ROW_LSX +#define HAS_ARGBTORGB565ROW_LSX +#define HAS_ARGBTORGB565DITHERROW_LSX +#define HAS_ARGBTOUVJROW_LSX +#define HAS_ARGBTOUV444ROW_LSX +#define HAS_ARGBTOUVROW_LSX +#define HAS_ARGBTOYJROW_LSX +#define HAS_ARGBMIRRORROW_LSX +#define HAS_ARGBMULTIPLYROW_LSX +#define HAS_BGRATOUVROW_LSX +#define HAS_BGRATOYROW_LSX +#define HAS_I400TOARGBROW_LSX +#define HAS_I444TOARGBROW_LSX +#define HAS_INTERPOLATEROW_LSX +#define HAS_I422ALPHATOARGBROW_LSX +#define HAS_I422TOARGB1555ROW_LSX +#define HAS_I422TOARGB4444ROW_LSX +#define HAS_I422TORGB24ROW_LSX +#define HAS_I422TORGB565ROW_LSX +#define HAS_I422TORGBAROW_LSX +#define HAS_I422TOUYVYROW_LSX +#define HAS_I422TOYUY2ROW_LSX +#define HAS_J400TOARGBROW_LSX +#define HAS_MERGEUVROW_LSX +#define HAS_MIRRORROW_LSX +#define HAS_MIRRORUVROW_LSX +#define 
HAS_MIRRORSPLITUVROW_LSX #define HAS_NV12TOARGBROW_LSX #define HAS_NV12TORGB565ROW_LSX #define HAS_NV21TOARGBROW_LSX +#define HAS_RAWTOARGBROW_LSX +#define HAS_RAWTORGB24ROW_LSX +#define HAS_RAWTOUVROW_LSX +#define HAS_RAWTOYROW_LSX +#define HAS_RGB24TOARGBROW_LSX +#define HAS_RGB24TOUVROW_LSX +#define HAS_RGB24TOYROW_LSX +#define HAS_RGB565TOARGBROW_LSX +#define HAS_RGB565TOUVROW_LSX +#define HAS_RGB565TOYROW_LSX +#define HAS_RGBATOUVROW_LSX +#define HAS_RGBATOYROW_LSX +#define HAS_SETROW_LSX #define HAS_SOBELROW_LSX #define HAS_SOBELTOPLANEROW_LSX #define HAS_SOBELXYROW_LSX -#define HAS_ARGBTOYJROW_LSX -#define HAS_BGRATOYROW_LSX -#define HAS_BGRATOUVROW_LSX -#define HAS_ABGRTOYROW_LSX -#define HAS_ABGRTOUVROW_LSX -#define HAS_RGBATOYROW_LSX -#define HAS_RGBATOUVROW_LSX -#define HAS_ARGBTOUVJROW_LSX -#define HAS_I444TOARGBROW_LSX -#define HAS_I400TOARGBROW_LSX -#define HAS_J400TOARGBROW_LSX -#define HAS_YUY2TOARGBROW_LSX -#define HAS_UYVYTOARGBROW_LSX -#define HAS_INTERPOLATEROW_LSX -#define HAS_ARGBSETROW_LSX -#define HAS_RAWTORGB24ROW_LSX -#define HAS_MERGEUVROW_LSX -#define HAS_ARGBEXTRACTALPHAROW_LSX -#define HAS_ARGBBLENDROW_LSX -#define HAS_ARGBQUANTIZEROW_LSX -#define HAS_ARGBCOLORMATRIXROW_LSX #define HAS_SPLITUVROW_LSX -#define HAS_SETROW_LSX -#define HAS_MIRRORSPLITUVROW_LSX +#define HAS_UYVYTOARGBROW_LSX +#define HAS_UYVYTOUV422ROW_LSX +#define HAS_UYVYTOUVROW_LSX +#define HAS_UYVYTOYROW_LSX +#define HAS_YUY2TOARGBROW_LSX +#define HAS_YUY2TOUVROW_LSX +#define HAS_YUY2TOUV422ROW_LSX +#define HAS_YUY2TOYROW_LSX +#define HAS_ARGBTOYROW_LSX +#define HAS_ABGRTOYJROW_LSX +#define HAS_RGBATOYJROW_LSX +#define HAS_RGB24TOYJROW_LSX +#define HAS_RAWTOYJROW_LSX +#endif + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#define HAS_I422TOARGBROW_LSX #endif #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) +#define HAS_ARGB1555TOARGBROW_LASX +#define HAS_ARGB1555TOUVROW_LASX +#define HAS_ARGB1555TOYROW_LASX +#define 
HAS_ARGB4444TOARGBROW_LASX +#define HAS_ARGBADDROW_LASX +#define HAS_ARGBATTENUATEROW_LASX +#define HAS_ARGBGRAYROW_LASX +#define HAS_ARGBMIRRORROW_LASX +#define HAS_ARGBMULTIPLYROW_LASX +#define HAS_ARGBSEPIAROW_LASX +#define HAS_ARGBSHADEROW_LASX +#define HAS_ARGBSHUFFLEROW_LASX +#define HAS_ARGBSUBTRACTROW_LASX +#define HAS_ARGBTOARGB1555ROW_LASX +#define HAS_ARGBTOARGB4444ROW_LASX +#define HAS_ARGBTORAWROW_LASX +#define HAS_ARGBTORGB24ROW_LASX +#define HAS_ARGBTORGB565DITHERROW_LASX +#define HAS_ARGBTORGB565ROW_LASX +#define HAS_ARGBTOUV444ROW_LASX +#define HAS_ARGBTOUVJROW_LASX +#define HAS_ARGBTOUVROW_LASX +#define HAS_ARGBTOYJROW_LASX +#define HAS_ARGBTOYROW_LASX +#define HAS_ABGRTOYJROW_LASX +#define HAS_ABGRTOYROW_LASX +#define HAS_I422ALPHATOARGBROW_LASX +#define HAS_I422TOARGB1555ROW_LASX +#define HAS_I422TOARGB4444ROW_LASX #define HAS_I422TOARGBROW_LASX +#define HAS_I422TORGB24ROW_LASX +#define HAS_I422TORGB565ROW_LASX #define HAS_I422TORGBAROW_LASX -#define HAS_I422ALPHATOARGBROW_LASX -#define HAS_I422TOYUY2ROW_LASX #define HAS_I422TOUYVYROW_LASX +#define HAS_I422TOYUY2ROW_LASX #define HAS_MIRRORROW_LASX #define HAS_MIRRORUVROW_LASX -#define HAS_ARGBMIRRORROW_LASX -#define HAS_I422TORGB24ROW_LASX -#define HAS_I422TORGB565ROW_LASX -#define HAS_I422TOARGB4444ROW_LASX -#define HAS_I422TOARGB1555ROW_LASX -#define HAS_YUY2TOUVROW_LASX -#define HAS_YUY2TOYROW_LASX -#define HAS_YUY2TOUV422ROW_LASX -#define HAS_UYVYTOYROW_LASX -#define HAS_UYVYTOUVROW_LASX -#define HAS_UYVYTOUV422ROW_LASX -#define HAS_ARGBTOYROW_LASX -#define HAS_ARGBTOUVROW_LASX -#define HAS_ARGBTORGB24ROW_LASX -#define HAS_ARGBTORAWROW_LASX -#define HAS_ARGBTORGB565ROW_LASX -#define HAS_ARGBTOARGB1555ROW_LASX -#define HAS_ARGBTOARGB4444ROW_LASX -#define HAS_ARGBTOUV444ROW_LASX -#define HAS_ARGBMULTIPLYROW_LASX -#define HAS_ARGBADDROW_LASX -#define HAS_ARGBSUBTRACTROW_LASX -#define HAS_ARGBATTENUATEROW_LASX -#define HAS_ARGBTORGB565DITHERROW_LASX -#define HAS_ARGBSHUFFLEROW_LASX -#define 
HAS_ARGBSHADEROW_LASX -#define HAS_ARGBGRAYROW_LASX -#define HAS_ARGBSEPIAROW_LASX -#define HAS_ARGB4444TOARGBROW_LASX -#define HAS_ARGB1555TOARGBROW_LASX -#define HAS_RGB565TOARGBROW_LASX -#define HAS_RGB24TOARGBROW_LASX -#define HAS_RAWTOARGBROW_LASX -#define HAS_ARGB1555TOYROW_LASX -#define HAS_ARGB1555TOUVROW_LASX -#define HAS_RGB565TOYROW_LASX -#define HAS_RGB565TOUVROW_LASX -#define HAS_RGB24TOYROW_LASX -#define HAS_RGB24TOUVROW_LASX -#define HAS_RAWTOYROW_LASX -#define HAS_RAWTOUVROW_LASX #define HAS_NV12TOARGBROW_LASX #define HAS_NV12TORGB565ROW_LASX #define HAS_NV21TOARGBROW_LASX -#define HAS_ARGBTOYJROW_LASX -#define HAS_ARGBTOUVJROW_LASX +#define HAS_RAWTOARGBROW_LASX +#define HAS_RAWTOUVROW_LASX +#define HAS_RAWTOYROW_LASX +#define HAS_RGB24TOARGBROW_LASX +#define HAS_RGB24TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX +#define HAS_RGB565TOARGBROW_LASX +#define HAS_RGB565TOUVROW_LASX +#define HAS_RGB565TOYROW_LASX +#define HAS_UYVYTOUV422ROW_LASX +#define HAS_UYVYTOUVROW_LASX +#define HAS_UYVYTOYROW_LASX +#define HAS_YUY2TOUV422ROW_LASX +#define HAS_YUY2TOUVROW_LASX +#define HAS_YUY2TOYROW_LASX +#define HAS_RGBATOYROW_LASX +#define HAS_RGBATOYJROW_LASX +#define HAS_BGRATOYROW_LASX +#define HAS_RGB24TOYJROW_LASX +#define HAS_RAWTOYJROW_LASX +#endif + +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_AB64TOARGBROW_RVV +#define HAS_AR64TOARGBROW_RVV +#define HAS_ARGBATTENUATEROW_RVV +#define HAS_ARGBTOAB64ROW_RVV +#define HAS_ARGBTOAR64ROW_RVV +#define HAS_ARGBTORAWROW_RVV +#define HAS_ARGBTORGB24ROW_RVV +#define HAS_ARGBTOYROW_RVV +#define HAS_ARGBTOYJROW_RVV +#define HAS_ABGRTOYROW_RVV +#define HAS_ABGRTOYJROW_RVV +#define HAS_BGRATOYROW_RVV +#define HAS_COPYROW_RVV +#define HAS_I400TOARGBROW_RVV +#define HAS_I422ALPHATOARGBROW_RVV +#define HAS_I422TOARGBROW_RVV +#define HAS_I422TORGB24ROW_RVV +#define HAS_I422TORGBAROW_RVV +#define HAS_I444ALPHATOARGBROW_RVV +#define HAS_I444TOARGBROW_RVV +#define HAS_I444TORGB24ROW_RVV +#define 
HAS_INTERPOLATEROW_RVV +#define HAS_J400TOARGBROW_RVV +#define HAS_MERGEARGBROW_RVV +#define HAS_MERGERGBROW_RVV +#define HAS_MERGEUVROW_RVV +#define HAS_MERGEXRGBROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITUVROW_RVV +#define HAS_SPLITXRGBROW_RVV +#define HAS_RAWTOARGBROW_RVV +#define HAS_RAWTORGB24ROW_RVV +#define HAS_RAWTORGBAROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV +#define HAS_RGB24TOARGBROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV +#define HAS_RGBATOYROW_RVV +#define HAS_RGBATOYJROW_RVV #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -789,8 +897,8 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +// This struct is for ARM and RISC-V color conversion. struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; @@ -816,13 +924,21 @@ struct YuvConstants { #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) -#define align_buffer_64(var, size) \ - uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#define align_buffer_64(var, size) \ + void* var##_mem = malloc((size) + 63); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ #define free_aligned_buffer_64(var) \ free(var##_mem); \ - var = 0 + var = NULL + +#define align_buffer_64_16(var, size) \ + void* var##_mem = malloc((size)*2 + 63); /* NOLINT */ \ + uint16_t* var = (uint16_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64_16(var) \ + free(var##_mem); \ + var = NULL #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP @@ -894,6 +1010,12 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* 
yuvconstants, int width); +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -981,6 +1103,50 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1000,6 +1166,12 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, 
+ const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1012,6 +1184,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1025,6 +1203,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1038,6 +1223,12 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1050,6 +1241,12 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1062,6 +1259,12 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1074,6 +1277,12 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1148,9 +1357,13 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1164,13 +1377,23 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); void 
RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1189,11 +1412,20 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_LSX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void 
ARGBToUVRow_LASX(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_LASX(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1203,6 +1435,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1258,6 +1495,11 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1372,6 +1614,13 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); @@ -1384,6 +1633,8 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void 
RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); @@ -1393,9 +1644,15 @@ void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -1409,6 +1666,7 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1423,6 +1681,7 @@ void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1453,10 +1712,15 @@ void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, void BGRAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1465,7 +1729,14 @@ void RGB565ToYRow_Any_LASX(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1485,6 +1756,11 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1495,6 +1771,11 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1525,6 +1806,11 @@ void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1535,6 +1821,11 @@ void 
ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1568,11 +1859,20 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1582,6 +1882,11 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1747,16 +2052,16 @@ void ARGBToUVJRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1772,6 +2077,11 @@ void RGBAToUVRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ 
-1826,6 +2136,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1833,17 +2144,20 @@ void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, @@ -1867,10 +2181,13 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_v, int width); +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width); + void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, @@ -1883,6 +2200,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1925,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1949,7 +2271,6 @@ void DetileRow_C(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); - void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, @@ -1966,6 +2287,42 @@ void DetileRow_Any_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_16_C(const uint16_t* src, + 
ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -1991,6 +2348,38 @@ void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size); +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2003,6 +2392,10 @@ 
void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2015,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2023,6 +2420,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void MergeUVRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2079,6 +2480,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2105,6 +2511,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2139,6 +2550,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width); void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2187,6 +2604,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, 
uint8_t* dst_b, uint8_t* dst_a, int width); +void SplitARGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width); void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2231,6 +2654,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width); void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2271,6 +2699,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitXRGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2604,8 +3037,8 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_NEON(const uint16_t* src_y, - uint8_t* dst_y, +void Convert16To8Row_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, int scale, int width); @@ -2614,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2713,6 +3147,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void 
ARGBShuffleRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -2733,6 +3171,10 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -2765,14 +3207,18 @@ void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -2932,15 +3378,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void 
ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2968,7 +3414,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2981,23 +3427,39 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* 
dst_rgb24, int width); + void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3035,6 +3497,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); @@ -3077,6 +3543,7 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3096,6 +3563,12 @@ void I444ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -3290,6 +3763,18 @@ void 
I444ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3631,12 +4116,24 @@ void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3823,13 +4320,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -3976,6 +4473,10 @@ void 
I400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, @@ -4084,10 +4585,18 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4130,10 +4639,18 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBAddRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBAddRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBAddRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4177,10 +4694,18 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBSubtractRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBSubtractRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4273,21 
+4798,37 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); +void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); - +void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4298,6 +4839,12 @@ void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4443,6 +4990,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_LASX(const uint8_t* y_buf, const 
uint8_t* u_buf, const uint8_t* v_buf, @@ -4455,6 +5008,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4468,6 +5027,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4481,6 +5047,12 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4493,6 +5065,12 @@ void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4505,6 +5083,12 @@ void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* 
dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4517,6 +5101,12 @@ void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4592,6 +5182,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4602,6 +5196,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4612,17 +5210,27 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, int 
src_stride_yuy2, uint8_t* dst_u, @@ -4632,6 +5240,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4642,6 +5254,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4652,6 +5268,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4662,6 +5282,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4672,17 +5296,27 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, 
+ uint8_t* dst_v, + int width); void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4692,6 +5326,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4737,12 +5375,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width); void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_LASX(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, @@ -4752,6 +5396,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, @@ -4798,12 +5446,18 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr, int 
src_stride_ptr, uint8_t* dst_u, @@ -4813,6 +5467,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4927,6 +5585,11 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToYUY2Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4937,6 +5600,11 @@ void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToUYVYRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4947,6 +5615,11 @@ void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4957,6 +5630,11 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4977,9 +5655,15 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_LASX(const 
uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4992,6 +5676,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -5018,12 +5705,14 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width); void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, @@ -5103,6 +5792,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); void ARGBShadeRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -5175,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void 
InterpolateRow_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -5526,6 +6224,17 @@ void GaussCol_F32_C(const float* src0, float* dst, int width); +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width); +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width); + +void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h index 6cb5e128..a7957c3f 100644 --- a/files/include/libyuv/scale_row.h +++ b/files/include/libyuv/scale_row.h @@ -133,6 +133,8 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEUVROWDOWN2_NEON +#define HAS_SCALEUVROWDOWN2LINEAR_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2_LINEAR_NEON @@ -214,6 +216,17 @@ void ScalePlaneVertical_16To8(int src_height, int scale, enum FilterMode filtering); +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering); + // Simplify the filtering based on scale factors. 
enum FilterMode ScaleFilterReduce(int src_width, int src_height, @@ -259,6 +272,16 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -267,6 +290,16 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -279,6 +312,16 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h index a85be048..b6623dbb 100644 --- a/files/include/libyuv/version.h +++ b/files/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1837 +#define LIBYUV_VERSION 1871 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/files/infra/config/PRESUBMIT.py b/files/infra/config/PRESUBMIT.py index 01ec0eed..f79e08ad 100644 --- a/files/infra/config/PRESUBMIT.py +++ 
b/files/infra/config/PRESUBMIT.py @@ -2,6 +2,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +USE_PYTHON3 = True + def CheckChangeOnUpload(input_api, output_api): return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api) diff --git a/files/infra/config/cr-buildbucket.cfg b/files/infra/config/cr-buildbucket.cfg index 061cf33b..be9d1d28 100644 --- a/files/infra/config/cr-buildbucket.cfg +++ b/files/infra/config/cr-buildbucket.cfg @@ -34,6 +34,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -65,6 +69,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -96,6 +104,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -111,7 +123,7 @@ buckets { name: "Android Tester ARM32 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -124,6 +136,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' 
"builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -139,7 +155,7 @@ buckets { name: "Android Tester ARM32 Release (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -152,6 +168,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -167,7 +187,7 @@ buckets { name: "Android Tester ARM64 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -180,6 +200,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -211,6 +235,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -242,6 +270,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -273,6 +305,10 @@ 
buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -304,6 +340,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -335,6 +375,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -366,6 +410,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -397,6 +445,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -428,6 +480,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -459,6 +515,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": 
"client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -490,6 +550,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -521,6 +585,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -537,7 +605,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -550,6 +618,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -566,7 +638,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -579,6 +651,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -595,7 +671,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: 
"vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -608,6 +684,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -639,6 +719,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -670,6 +754,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -701,6 +789,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -732,6 +824,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -763,6 +859,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": 
"libyuv/libyuv"' '}' @@ -794,6 +894,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -825,6 +929,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -856,6 +964,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -872,7 +984,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -885,6 +997,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -901,7 +1017,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -914,6 +1030,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": 
"rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -985,7 +1105,7 @@ buckets { name: "android" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -998,6 +1118,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1013,7 +1137,7 @@ buckets { name: "android_arm64" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1026,6 +1150,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1041,7 +1169,7 @@ buckets { name: "android_rel" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1054,6 +1182,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' 
},' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1085,6 +1217,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1116,6 +1252,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1132,7 +1272,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1145,6 +1285,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1161,7 +1305,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1174,6 +1318,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1205,6 +1353,10 @@ buckets { ' "server_host": 
"goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1236,6 +1388,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1267,6 +1423,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1298,6 +1458,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1329,6 +1493,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1360,6 +1528,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1391,6 +1563,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' 
"builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1422,6 +1598,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1438,7 +1618,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1451,6 +1631,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1467,7 +1651,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1480,6 +1664,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1496,7 +1684,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1509,6 +1697,10 @@ buckets { ' 
"server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1540,6 +1732,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "run_presubmit",' ' "repo_name": "libyuv",' @@ -1573,6 +1769,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1604,6 +1804,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1635,6 +1839,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1666,6 +1874,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1697,6 +1909,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": 
"chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1728,6 +1944,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' diff --git a/files/infra/config/main.star b/files/infra/config/main.star index b922ca02..7490a599 100755 --- a/files/infra/config/main.star +++ b/files/infra/config/main.star @@ -26,6 +26,16 @@ GOMA_BACKEND_RBE_NO_ATS_PROD = { "enable_ats": False, } +RECLIENT_CI = { + "instance": "rbe-webrtc-trusted", + "metrics_project": "chromium-reclient-metrics", +} + +RECLIENT_CQ = { + "instance": "rbe-webrtc-untrusted", + "metrics_project": "chromium-reclient-metrics", +} + # Use LUCI Scheduler BBv2 names and add Scheduler realms configs. lucicfg.enable_experiment("crbug.com/1182002") @@ -69,6 +79,10 @@ luci.project( acl.entry(acl.BUILDBUCKET_OWNER, groups = ["project-libyuv-admins"]), ], bindings = [ + luci.binding( + roles = "role/swarming.taskTriggerer", # for LED tasks. 
+ groups = "project-libyuv-admins", + ), luci.binding( roles = "role/configs.validator", users = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com", @@ -195,9 +209,9 @@ luci.bucket( def get_os_dimensions(os): if os == "android": - return {"device_type": "bullhead"} + return {"device_type": "walleye"} if os == "ios" or os == "mac": - return {"os": "Mac-10.15", "cpu": "x86-64"} + return {"os": "Mac-12", "cpu": "x86-64"} elif os == "win": return {"os": "Windows-10", "cores": "8", "cpu": "x86-64"} elif os == "linux": @@ -255,6 +269,7 @@ def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyu def ci_builder(name, os, category, short_name = None): dimensions = get_os_dimensions(os) properties = get_os_properties(os) + properties["$build/reclient"] = RECLIENT_CI dimensions["pool"] = "luci.flex.ci" properties["builder_group"] = "client.libyuv" @@ -266,6 +281,7 @@ def ci_builder(name, os, category, short_name = None): def try_builder(name, os, experiment_percentage = None): dimensions = get_os_dimensions(os) properties = get_os_properties(os, try_builder = True) + properties["$build/reclient"] = RECLIENT_CQ dimensions["pool"] = "luci.flex.try" properties["builder_group"] = "tryserver.libyuv" diff --git a/files/infra/config/project.cfg b/files/infra/config/project.cfg index 700226ad..af79cfb2 100644 --- a/files/infra/config/project.cfg +++ b/files/infra/config/project.cfg @@ -7,7 +7,7 @@ name: "libyuv" access: "group:all" lucicfg { - version: "1.30.9" + version: "1.39.8" package_dir: "." config_dir: "." 
entry_point: "main.star" diff --git a/files/infra/config/realms.cfg b/files/infra/config/realms.cfg index ae04529e..16ffaac9 100644 --- a/files/infra/config/realms.cfg +++ b/files/infra/config/realms.cfg @@ -38,6 +38,10 @@ realms { role: "role/scheduler.reader" principals: "group:all" } + bindings { + role: "role/swarming.taskTriggerer" + principals: "group:project-libyuv-admins" + } } realms { name: "ci" diff --git a/files/libyuv.gni b/files/libyuv.gni index 8df40ba2..0a6c4453 100644 --- a/files/libyuv.gni +++ b/files/libyuv.gni @@ -6,13 +6,14 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. -import("//build_overrides/build.gni") import("//build/config/arm.gni") import("//build/config/mips.gni") +import("//build_overrides/build.gni") declare_args() { libyuv_include_tests = !build_with_chromium libyuv_disable_jpeg = false + libyuv_disable_rvv = false libyuv_use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) diff --git a/files/riscv_script/prepare_toolchain_qemu.sh b/files/riscv_script/prepare_toolchain_qemu.sh new file mode 100755 index 00000000..2a901739 --- /dev/null +++ b/files/riscv_script/prepare_toolchain_qemu.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -ev + +# Download & build RISC-V Clang toolchain & QEMU emulator. +# RISC-V Clang is for cross compile with the RISC-V Vector ISA. +# RISC-V QEMU is used to run the test suite. +# +# Requirements: Linux host w/ working C++ compiler, git, cmake, ninja, wget, tar + +# NOTE: this script must be run from the top-level directory of the LIBYUV_SRC_DIR. 
+ +RISCV_TRIPLE="riscv64-unknown-linux-gnu" +RISCV_QEMU="qemu-riscv64" + +LIBYUV_SRC_DIR=$(pwd) +BUILD_DIR="$LIBYUV_SRC_DIR"/build-toolchain-qemu +INSTALL_QEMU="$BUILD_DIR"/riscv-qemu +INSTALL_CLANG="$BUILD_DIR"/riscv-clang + +LLVM_VERSION="16.0.0" +LLVM_NAME=llvm-project-"$LLVM_VERSION".src + +RISCV_GNU_TOOLCHAIN="$BUILD_DIR"/riscv-gnu-toolchain +RISCV_CLANG_TOOLCHAIN="$BUILD_DIR"/"$LLVM_NAME" + +QEMU_NAME="qemu-7.0.0" + +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# Download and install RISC-V GNU Toolchain (needed to build Clang) +if [ ! -d "$RISCV_GNU_TOOLCHAIN" ] +then + git clone git@github.com:riscv/riscv-gnu-toolchain.git + pushd "$RISCV_GNU_TOOLCHAIN" + git submodule update --init --recursive + ./configure --with-cmodel=medany --prefix="$INSTALL_CLANG" + ionice nice make linux -j `nproc` install + popd +fi + +# Download Clang toolchain & build cross compiler +if [ ! -d "$RISCV_CLANG_TOOLCHAIN" ] +then + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-"$LLVM_VERSION"/"$LLVM_NAME".tar.xz + tar xvJf "$LLVM_NAME".tar.xz + pushd "$RISCV_CLANG_TOOLCHAIN" + cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_CLANG" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD="RISCV" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_DEFAULT_TARGET_TRIPLE="$RISCV_TRIPLE" \ + -DLLVM_INSTALL_TOOLCHAIN_ONLY=On \ + -DDEFAULT_SYSROOT=../sysroot \ + -G "Ninja" "$RISCV_CLANG_TOOLCHAIN"/llvm + ionice nice ninja -j `nproc` + ionice nice ninja -j `nproc` install + popd + pushd "$INSTALL_CLANG"/bin + ln -sf clang "$RISCV_TRIPLE"-clang + ln -sf clang++ "$RISCV_TRIPLE"-clang++ + popd +fi + +# Download QEMU and build the riscv64 Linux usermode emulator +if [ ! 
-d "$QEMU_NAME" ] +then + wget https://download.qemu.org/"$QEMU_NAME".tar.xz + tar xvJf "$QEMU_NAME".tar.xz + pushd "$QEMU_NAME" + ./configure --target-list=riscv64-linux-user --prefix="$INSTALL_QEMU" + ionice nice make -j `nproc` install + popd +fi diff --git a/files/riscv_script/riscv-clang.cmake b/files/riscv_script/riscv-clang.cmake new file mode 100644 index 00000000..47dd5067 --- /dev/null +++ b/files/riscv_script/riscv-clang.cmake @@ -0,0 +1,52 @@ +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME "Linux") +set(CMAKE_SYSTEM_PROCESSOR "riscv64") + +option(USE_RVV "Enable riscv vector or not." ON) +option(USE_AUTO_VECTORIZER "Enable riscv auto vectorizer or not." OFF) + +# Avoid to use system path for cross-compile +set(CMAKE_FIND_USE_CMAKE_SYSTEM_PATH FALSE) + +set(TOOLCHAIN_PATH "" CACHE STRING "The toolcahin path.") +if(NOT TOOLCHAIN_PATH) + set(TOOLCHAIN_PATH ${CMAKE_SOURCE_DIR}/build-toolchain-qemu/riscv-clang) +endif() + +set(TOOLCHAIN_PREFIX "riscv64-unknown-linux-gnu-" CACHE STRING "The toolcahin prefix.") + +# toolchain setting +set(CMAKE_C_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang") +set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang++") + +# CMake will just use the host-side tools for the following tools, so we setup them here. 
+set(CMAKE_C_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_CXX_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_C_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_CXX_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump") +set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy") + +# compile options +message(STATUS "USE_RVV: ${USE_RVV}") +message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}") +set(RISCV_COMPILER_FLAGS) +if(USE_RVV) + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv") + if(NOT USE_AUTO_VECTORIZER) + # Disable auto-vectorizer + add_compile_options(-fno-vectorize -fno-slp-vectorize) + endif() +else() + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc") +endif() +message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}") + +set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}") + +set(RISCV_LINKER_FLAGS "-lstdc++ -lpthread -lm -ldl") +set(RISCV_LINKER_FLAGS_EXE) +set(CMAKE_SHARED_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}") +set(CMAKE_MODULE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${RISCV_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}") diff --git a/files/riscv_script/run_qemu.sh b/files/riscv_script/run_qemu.sh new file mode 100755 index 00000000..080af3b1 --- /dev/null +++ b/files/riscv_script/run_qemu.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x +set -e + +USE_RVV="${USE_RVV:-OFF}" +TOOLCHAIN_PATH="${TOOLCHAIN_PATH:-../../build-toolchain-qemu/riscv-clang}" +QEMU_PREFIX_PATH="${QEMU_PREFIX_PATH:-../../build-toolchain-qemu/riscv-qemu/}" + +if [ "${USE_RVV}" = "ON" ];then + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0 -L ${TOOLCHAIN_PATH}/sysroot" +else + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true -L 
${TOOLCHAIN_PATH}/sysroot" +fi + +$QEMU_PREFIX_PATH/bin/qemu-riscv64 $QEMU_OPTION $@ diff --git a/files/source/compare.cc b/files/source/compare.cc index d4713b60..50a736bd 100644 --- a/files/source/compare.cc +++ b/files/source/compare.cc @@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { } #endif - while (count >= (uint64_t)(kBlockSize)) { + while (count >= (uint64_t)kBlockSize) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; @@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a, (sum_a_sq + sum_b_sq + c1) * (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - if (ssim_d == 0.0) { + if (ssim_d == 0) { return DBL_MAX; } - return ssim_n * 1.0 / ssim_d; + return (double)ssim_n / (double)ssim_d; } } diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc index b834b42a..33cbe25d 100644 --- a/files/source/compare_gcc.cc +++ b/files/source/compare_gcc.cc @@ -67,7 +67,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - return static_cast(diff); + return (uint32_t)(diff); } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc deleted file mode 100644 index 7640d946..00000000 --- a/files/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t 
SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/convert.cc b/files/source/convert.cc index 7178580f..b11ab1bf 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -24,6 +24,10 @@ namespace libyuv { extern "C" { #endif +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? 
v : -v; @@ -199,6 +203,99 @@ static int Planar16bitTo8bit(const uint16_t* src_y, return 0; } +static int I41xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, scale, kFilterBilinear); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, scale, kFilterBilinear); + } + return 0; +} + +static int I21xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } + return 0; +} + // Convert 10 bit YUV to 8 bit. LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -236,38 +333,9 @@ int I210ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - const int depth = 10; - const int scale = 1 << (24 - depth); - - if (width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - { - const int uv_width = SUBSAMPLE(width, 1, 1); - const int uv_height = SUBSAMPLE(height, 1, 1); - const int dy = FixedDiv(height, uv_height); - - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, - dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, - dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - } - return 0; + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); } LIBYUV_API @@ -291,6 +359,26 @@ int I210ToI422(const uint16_t* src_y, 0, 10); } +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); +} + LIBYUV_API int I410ToI444(const uint16_t* src_y, int src_stride_y, @@ -354,6 +442,26 @@ int I212ToI422(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int 
height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, @@ -375,6 +483,26 @@ int I412ToI444(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + // Any Ix10 To I010 format with mirroring. static int Ix10ToI010(const uint16_t* src_y, int src_stride_y, @@ -713,6 +841,110 @@ int MM21ToI420(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + if (!src_y || !src_uv || !dst_yuy2 || width <= 0) { + return -1; + } + + DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2, + dst_stride_yuy2, width, height, 32); + + return 0; +} + +// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format +// documentation. +// TODO(greenjustin): Add an MT2T to I420 conversion. 
+LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || !height || !src_uv || !dst_uv) { + return -1; + } + + { + int uv_width = (width + 1) & ~1; + int uv_height = (height + 1) / 2; + int y = 0; + const int tile_width = 16; + const int y_tile_height = 32; + const int uv_tile_height = 16; + int padded_width = (width + tile_width - 1) & ~(tile_width - 1); + int y_tile_row_size = padded_width * y_tile_height * 10 / 8; + int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8; + size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t); + void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) = + UnpackMT2T_C; + align_buffer_64(row_buf, row_buf_size); + +#if defined(HAS_UNPACKMT2T_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UnpackMT2T = UnpackMT2T_NEON; + } +#endif + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + uv_height = (height + 1) / 2; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + + // Unpack and detile Y in rows of tiles + if (src_y && dst_y) { + for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, y_tile_height, y_tile_height); + src_y += src_stride_y * y_tile_height; + dst_y += dst_stride_y * y_tile_height; + } + if (height & (y_tile_height - 1)) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, height & (y_tile_height - 1), y_tile_height); + } + } + + // Unpack and detile UV plane + for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_tile_height, uv_tile_height); + src_uv += src_stride_uv * uv_tile_height; + dst_uv += dst_stride_uv * uv_tile_height; + } + if (uv_height & (uv_tile_height - 1)) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_height & (uv_tile_height - 1), + uv_tile_height); + } + free_aligned_buffer_64(row_buf); + } + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height @@ -734,7 +966,7 @@ int I422ToNV21(const uint8_t* src_y, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; int halfwidth = (width + 1) >> 1; @@ -764,11 +996,19 @@ int I422ToNV21(const uint8_t* src_y, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -793,6 +1033,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -833,6 +1078,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); @@ -1118,6 +1368,70 @@ int NV16ToNV24(const uint8_t* src_y, return 0; } +// Any P[420]1[02] to I[420]1[02] format with mirroring. 
+static int PxxxToIxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, uv_width, uv_height, depth); + return 0; +} + +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 10); +} + +LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 12); +} + LIBYUV_API int P010ToP410(const uint16_t* src_y, int src_stride_y, @@ -1231,6 +1545,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUVRow = YUY2ToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUVRow = YUY2ToUVRow_LSX; + } + } +#endif #if 
defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1322,6 +1646,26 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { UYVYToYRow = UYVYToYRow_Any_LASX; @@ -1574,6 +1918,176 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 
0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +#ifdef USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +// The following version calls ARGBExtractAlpha on the full image. +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); + if (r == 0) { + r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width, + height); + } + return r; +} +#else // USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1584,22 +2098,58 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); + ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, + width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; + dst_a += dst_stride_a * 2; } if (height & 1) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); } return 0; } +#endif // USE_EXTRACTALPHA // Convert BGRA to I420. 
LIBYUV_API @@ -1628,16 +2178,6 @@ int BGRAToI420(const uint8_t* src_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } -#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; - BGRAToYRow = BGRAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - BGRAToYRow = BGRAToYRow_SSSE3; - } - } -#endif #if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; @@ -1654,12 +2194,46 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToYRow = BGRAToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_AVX2; + } + } +#endif +#if defined(HAS_BGRATOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToUVRow = BGRAToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToUVRow = BGRAToUVRow_AVX2; + } + } +#endif #if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { BGRAToYRow = BGRAToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { BGRAToUVRow = BGRAToUVRow_MSA; } } @@ -1674,6 +2248,19 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + BGRAToYRow = BGRAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_LASX; + } + } +#endif +#if defined(HAS_BGRATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { 
+ BGRAToYRow = BGRAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -1786,6 +2373,19 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -1882,6 +2482,19 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_LASX) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYRow = RGBAToYRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYRow = RGBAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -1901,7 +2514,7 @@ int RGBAToI420(const uint8_t* src_rgba, // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_LSX)) + defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV)) #define HAS_RGB24TOYROW #endif @@ -1986,6 +2599,11 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW @@ -2035,8 +2653,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2046,10 +2664,10 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2075,7 +2693,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, #undef HAS_RGB24TOYROW // Enabled if 1 pass is available -#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_RVV) #define HAS_RGB24TOYJROW #endif @@ -2140,6 +2759,27 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW @@ -2189,8 +2829,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2200,10 +2840,10 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2230,7 +2870,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_LSX)) + defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV)) #define HAS_RAWTOYROW #endif @@ -2314,6 +2954,11 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYROW @@ -2363,8 +3008,8 @@ int RAWToI420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2374,10 +3019,10 @@ int RAWToI420(const uint8_t* src_raw, RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2403,7 +3048,8 @@ int RAWToI420(const uint8_t* src_raw, #undef HAS_RAWTOYROW // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2468,6 +3114,27 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW @@ -2517,8 +3184,8 @@ int RAWToJ420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2528,10 +3195,10 @@ int RAWToJ420(const uint8_t* src_raw, RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2695,8 +3362,8 @@ int RGB565ToI420(const uint8_t* src_rgb565, #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ @@ -2706,10 +3373,10 @@ int RGB565ToI420(const uint8_t* src_rgb565, RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb565 += src_stride_rgb565 * 2; dst_y += dst_stride_y * 2; @@ -2875,8 +3542,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2888,11 +3555,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, width); #else ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb1555 += src_stride_argb1555 * 2; dst_y += dst_stride_y * 2; @@ -3055,6 +3722,24 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -3070,8 +3755,8 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, { #if !(defined(HAS_ARGB4444TOYROW_NEON)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -3082,11 +3767,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb4444 += src_stride_argb4444 * 2; dst_y += dst_stride_y * 2; @@ -3167,6 +3852,27 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToYJRow(src_rgb24, dst_yj, width); @@ -3235,6 +3941,27 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; 
+ } +#endif for (y = 0; y < height; ++y) { RAWToYJRow(src_raw, dst_yj, width); diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc index 71ef8c10..cc6560de 100644 --- a/files/source/convert_argb.cc +++ b/files/source/convert_argb.cc @@ -7,8 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include "libyuv/convert_argb.h" +#include + +#include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" @@ -65,6 +69,7 @@ int I420ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -115,6 +120,14 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -123,6 +136,11 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -298,6 +316,7 @@ int I422ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -355,6 +374,14 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if 
(IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -363,6 +390,11 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -536,6 +568,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -592,6 +625,11 @@ int I444ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -747,6 +785,133 @@ int U444ToABGR(const uint8_t* src_y, width, height); } +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } + // Coalesce rows. 
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to RGB24. 
+LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I444 to RAW. +LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. 
@@ -767,6 +932,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -926,6 +1092,7 @@ int I012ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -983,6 +1150,7 @@ int I210ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1137,6 +1305,7 @@ int I410ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1190,6 +1359,7 @@ int I010ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1353,6 +1523,7 @@ int I012ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1408,6 +1579,7 @@ int I210ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 
0 || height == 0) { return -1; } @@ -1568,6 +1740,7 @@ int I410ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1617,6 +1790,7 @@ int P010ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1667,6 +1841,7 @@ int P210ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1715,6 +1890,7 @@ int P010ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1765,6 +1941,7 @@ int P210ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1823,6 +2000,7 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1865,6 +2043,14 @@ int 
I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1873,6 +2059,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -1905,6 +2096,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -1947,6 +2143,7 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1989,6 +2186,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1997,6 +2202,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + 
} +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2029,6 +2239,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2069,6 +2284,7 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2111,6 +2327,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2143,6 +2364,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2312,6 +2538,7 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2370,6 +2597,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { 
I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2412,6 +2644,7 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2470,6 +2703,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2510,6 +2748,7 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2568,6 +2807,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2597,6 +2841,7 @@ int I400ToARGBMatrix(const uint8_t* src_y, void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; + assert(yuvconstants); if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2652,6 +2897,11 @@ int I400ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I400ToARGBRow = I400ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, yuvconstants, width); @@ -2739,6 +2989,12 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if 
defined(HAS_J400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + J400ToARGBRow = J400ToARGBRow_RVV; + } +#endif + for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -2901,6 +3157,11 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -2976,6 +3237,11 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToARGBRow = RAWToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -3027,6 +3293,11 @@ int RAWToRGBA(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGBARow = RAWToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGBARow(src_raw, dst_rgba, width); @@ -3431,6 +3702,11 @@ int AR64ToARGB(const uint16_t* src_ar64, } } #endif +#if defined(HAS_AR64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToARGBRow = AR64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AR64ToARGBRow(src_ar64, dst_argb, width); @@ -3490,6 +3766,11 @@ int AB64ToARGB(const uint16_t* src_ab64, } } #endif +#if defined(HAS_AB64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AB64ToARGBRow = AB64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AB64ToARGBRow(src_ab64, dst_argb, width); @@ -3514,6 +3795,7 @@ int NV12ToARGBMatrix(const uint8_t* src_y, void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3598,6 +3880,7 @@ int NV21ToARGBMatrix(const uint8_t* src_y, void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* 
rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3741,6 +4024,7 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -3801,6 +4085,7 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -4143,6 +4428,7 @@ int Android420ToARGBMatrix(const uint8_t* src_y, const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -4243,6 +4529,7 @@ int I422ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ -4284,6 +4571,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4292,6 +4587,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } 
+#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4354,6 +4654,7 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, void (*NV12ToRGB565Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4456,6 +4757,7 @@ int I420ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ -4497,6 +4799,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4505,6 +4815,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4572,6 +4887,7 @@ int I420ToRGB24Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -4613,6 +4929,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif #if 
defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB24Row = I422ToRGB24Row_Any_LASX; @@ -4621,6 +4945,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -4742,70 +5071,206 @@ int H420ToRAW(const uint8_t* src_y, width, height); } -// Convert I420 to ARGB1555. +// Convert I422 to RGB24 with matrix. LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if defined(HAS_I422TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) +#if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; } } #endif -#if defined(HAS_I422TOARGB1555ROW_NEON) +#if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; + I422ToRGB24Row = I422ToRGB24Row_NEON; } } #endif -#if defined(HAS_I422TOARGB1555ROW_MSA) +#if defined(HAS_I422TORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; } } #endif -#if defined(HAS_I422TOARGB1555ROW_LASX) +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { - 
I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_LASX; + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. 
+LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_LSX; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + 
I422ToARGB1555Row = I422ToARGB1555Row_LASX; } } #endif @@ -4882,6 +5347,14 @@ int I420ToARGB4444(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; @@ -4922,6 +5395,7 @@ int I420ToRGB565Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4963,6 +5437,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5035,23 +5517,25 @@ int H420ToRGB565(const uint8_t* src_y, &kYuvH709Constants, width, height); } -// Convert I422 to RGB565. +// Convert I422 to RGB565 with specified color matrix. 
LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -5093,6 +5577,14 @@ int I422ToRGB565(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5103,7 +5595,7 @@ int I422ToRGB565(const uint8_t* src_y, #endif for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; src_u += src_stride_u; @@ -5112,6 +5604,23 @@ int I422ToRGB565(const uint8_t* src_y, return 0; } +// Convert I422 to RGB565. 
+LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, @@ -5136,7 +5645,7 @@ int I420ToRGB565Dither(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -5191,6 +5700,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -5199,6 +5716,11 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; @@ -5231,6 +5753,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if 
defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -5278,6 +5808,7 @@ int I420ToAR30Matrix(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5401,9 +5932,12 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5453,48 +5987,57 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if 
(TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5506,8 +6049,8 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); 
I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5531,8 +6074,9 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5582,36 +6126,41 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); 
I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5623,6 +6172,148 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, return 0; } +static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = 
ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + static int I010ToAR30MatrixBilinear(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -5639,9 +6330,12 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void 
(*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5668,41 +6362,44 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + 
ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5714,8 +6411,8 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); } @@ -5740,8 +6437,9 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5770,29 +6468,29 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif 
#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5819,9 +6517,12 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5848,41 +6549,44 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = 
ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += 
src_stride_y; @@ -5894,8 +6598,8 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5919,8 +6623,9 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5949,29 +6654,29 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - 
ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6006,9 +6711,12 @@ static int I420AlphaToARGBMatrixBilinear( int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6059,6 +6767,11 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6091,40 +6804,50 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp 
= ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6135,8 +6858,8 @@ static int I420AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6158,8 +6881,8 @@ static int 
I420AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6193,8 +6916,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6245,6 +6969,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6277,36 +7006,42 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = 
ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6346,9 +7081,12 @@ static int I010AlphaToARGBMatrixBilinear( int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6407,35 +7145,43 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = 
ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6446,8 +7192,8 @@ static int I010AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, 
width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6469,8 +7215,8 @@ static int I010AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6504,8 +7250,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6564,32 +7311,37 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int 
row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6618,9 +7370,10 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6649,35 +7402,35 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * 
sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6688,7 +7441,7 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); } @@ -6709,8 +7462,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6739,28 +7493,28 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if 
(TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6784,9 +7538,10 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6815,35 +7570,35 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef 
HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6854,7 +7609,7 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); } @@ -6875,8 +7630,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6905,28 +7661,28 @@ static 
int P210ToAR30MatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6937,6 +7693,133 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, return 0; } +static int I422ToRGB24MatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +LIBYUV_API +int 
I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToRGB24MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + LIBYUV_API int I420ToARGBMatrixFilter(const uint8_t* src_y, int src_stride_y, @@ -6997,6 +7880,35 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, return -1; } +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I420ToRGB24MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, int src_stride_y, @@ -7015,13 +7927,12 @@ int I010ToAR30MatrixFilter(const uint16_t* src_y, return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, 
width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToAR30MatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7074,13 +7985,12 @@ int I010ToARGBMatrixFilter(const uint16_t* src_y, return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7137,14 +8047,13 @@ int I420AlphaToARGBMatrixFilter(const uint8_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I420AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7206,14 +8115,13 @@ int I010AlphaToARGBMatrixFilter(const uint16_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7253,6 +8161,8 @@ int 
I210AlphaToARGBMatrixFilter(const uint16_t* src_y, return -1; } +// TODO(fb): Verify this function works correctly. P010 is like NV12 but 10 bit +// UV is biplanar. LIBYUV_API int P010ToARGBMatrixFilter(const uint16_t* src_y, int src_stride_y, @@ -7269,13 +8179,12 @@ int P010ToARGBMatrixFilter(const uint16_t* src_y, return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7324,13 +8233,12 @@ int P010ToAR30MatrixFilter(const uint16_t* src_y, return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc index 8bd07e4c..4102d610 100644 --- a/files/source/convert_from.cc +++ b/files/source/convert_from.cc @@ -446,6 +446,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -533,6 +541,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + 
I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -608,6 +624,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc index e50c2af3..c3d037c4 100644 --- a/files/source/convert_from_argb.cc +++ b/files/source/convert_from_argb.cc @@ -76,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToUV444Row = ARGBToUV444Row_Any_LASX; @@ -116,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -124,6 +140,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -230,7 +251,24 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif - +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) 
&& defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -241,6 +279,11 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); @@ -340,6 +383,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -350,6 +401,11 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -361,11 +417,19 @@ int ARGBToNV12(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -389,6 +453,11 @@ int ARGBToNV12(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if 
defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. @@ -502,6 +571,24 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -512,6 +599,11 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -523,11 +615,19 @@ int ARGBToNV21(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -551,6 +651,11 @@ int ARGBToNV21(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -663,6 +768,27 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -674,11 +800,19 @@ int ABGRToNV12(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -702,6 +836,11 @@ int ABGRToNV12(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -815,6 +954,27 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -826,11 +986,19 @@ int ABGRToNV21(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -854,6 +1022,11 @@ int ABGRToNV21(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -972,6 +1145,24 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -982,6 +1173,11 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -1014,6 +1210,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -1135,6 +1339,24 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ 
-1145,6 +1367,11 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -1177,6 +1404,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -1262,6 +1497,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1270,6 +1513,11 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -1360,6 +1608,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; @@ -1368,6 +1624,11 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGB24Row = ARGBToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1434,6 +1695,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBTORAWROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORAWROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRAWRow = ARGBToRAWRow_Any_LASX; @@ -1442,6 +1711,11 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRAWRow = ARGBToRAWRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1467,7 +1741,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1512,6 +1786,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -1589,6 +1871,15 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_LSX; + } + } +#endif + #if defined(HAS_ARGBTORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; @@ -1663,6 +1954,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LSX; + 
} + } +#endif #if defined(HAS_ARGBTOARGB1555ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; @@ -1737,6 +2036,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; @@ -1858,19 +2165,19 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -1879,6 +2186,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -1903,19 +2226,11 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif @@ -1951,18 +2266,23 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); src_argb += src_stride_argb * 2; dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, 
dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); } return 0; @@ -1974,19 +2294,19 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1997,10 +2317,10 @@ int ARGBToJ422(const uint8_t* src_argb, } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2026,6 +2346,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -2074,270 +2402,649 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, 
dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert ARGB to AR64. +// Convert ARGB to J400. LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, +int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ar64 = 0; + src_stride_argb = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAR64ROW_NEON) +#if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); + ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to AB64. +// Convert RGBA to J400. 
LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, +int RGBAToJ400(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = + RGBAToYJRow_C; + if (!src_rgba || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; } // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + if (src_stride_rgba == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ab64 = 0; + src_stride_rgba = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) +#if defined(HAS_RGBATOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; + RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) +#if defined(HAS_RGBATOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; + RGBAToYJRow = RGBAToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAB64ROW_NEON) +#if defined(HAS_RGBATOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; + 
RGBAToYJRow = RGBAToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGBAToYJRow = RGBAToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_LSX; } } #endif +#if defined(HAS_RGBATOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGBAToYJRow = RGBAToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYJRow = RGBAToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; + RGBAToYJRow(src_rgba, dst_yj, width); + src_rgba += src_stride_rgba; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to J400. +// Convert ABGR to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + 
ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width); + src_abgr += src_stride_abgr * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + } + return 0; +} + +// Convert ABGR to J422. (JPeg full range I422). +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow 
= ABGRToUVJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_MSA; } } #endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert RGBA to J400. +// Convert ABGR to J400. LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; - void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = - RGBAToYJRow_C; - if (!src_rgba || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_rgba == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_rgba = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = 0; } -#if defined(HAS_RGBATOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_RGBATOYJROW_AVX2) +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBAToYJRow = RGBAToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RGBAToYJRow = RGBAToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_RGBATOYJROW_NEON) +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYJRow = RGBAToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_RGBATOYJROW_MSA) +#if defined(HAS_ABGRTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; } } #endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - RGBAToYJRow(src_rgba, dst_yj, width); - src_rgba += src_stride_rgba; + ABGRToYJRow(src_abgr, dst_yj, 
width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; } return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAR64Row = ARGBToAR64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. 
+LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAB64Row = ARGBToAB64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2355,7 +3062,7 @@ int RAWToJNV21(const uint8_t* src_raw, int halfwidth = (width + 1) >> 1; #if defined(HAS_RAWTOYJROW) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, 
uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; @@ -2363,12 +3070,12 @@ int RAWToJNV21(const uint8_t* src_raw, void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; @@ -2403,6 +3110,27 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else // HAS_RAWTOYJROW @@ -2459,11 +3187,19 @@ int RAWToJNV21(const uint8_t* src_raw, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -2487,30 +3223,35 @@ int RAWToJNV21(const uint8_t* src_raw, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a row of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_vj = row_uj + ((halfwidth + 31) & ~31); #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2518,20 +3259,20 @@ int RAWToJNV21(const uint8_t* src_raw, } if (height & 1) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); #endif } #if !defined(HAS_RAWTOYJROW) free_aligned_buffer_64(row); #endif - free_aligned_buffer_64(row_u); + free_aligned_buffer_64(row_uj); } return 0; } diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index 
56fe60e4..0c4a1581 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -40,7 +40,6 @@ extern "C" { // cpu_info_ variable for SIMD instruction sets detected. LIBYUV_API int cpu_info_ = 0; -// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ @@ -108,14 +107,14 @@ void CpuId(int eax, int ecx, int* cpu_info) { // } // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. // https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -int GetXCR0() { +static int GetXCR0() { int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT @@ -129,7 +128,7 @@ int GetXCR0() { #define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. -#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", on) #endif @@ -137,13 +136,14 @@ int GetXCR0() { // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume Neon if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. 
return kCpuHasNEON; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "Features", 8) == 0) { char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { @@ -162,17 +162,90 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// TODO(fbarchard): Consider read_msa_ir(). +LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); + if (!f) { +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasRVV; +#else + return 0; +#endif + } + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { + if (memcmp(cpuinfo_line, "isa", 3) == 0) { + // ISA string must begin with rv64{i,e,g} for a 64-bit processor. + char* isa = strstr(cpuinfo_line, "rv64"); + if (isa) { + size_t isa_len = strlen(isa); + char* extensions; + size_t extensions_len = 0; + size_t std_isa_len; + // Remove the new-line character at the end of string + if (isa[isa_len - 1] == '\n') { + isa[--isa_len] = '\0'; + } + // 5 ISA characters + if (isa_len < 5) { + fclose(f); + return 0; + } + // Skip {i,e,g} canonical checking. + // Skip rvxxx + isa += 5; + // Find the very first occurrence of 's', 'x' or 'z'. + // To detect multi-letter standard, non-standard, and + // supervisor-level extensions. + extensions = strpbrk(isa, "zxs"); + if (extensions) { + // Multi-letter extensions are seperated by a single underscore + // as described in RISC-V User-Level ISA V2.2. + char* ext = strtok(extensions, "_"); + extensions_len = strlen(extensions); + while (ext) { + // Search for the ZVFH (Vector FP16) extension. 
+ if (!strcmp(ext, "zvfh")) { + flag |= kCpuHasRVVZVFH; + } + ext = strtok(NULL, "_"); + } + } + std_isa_len = isa_len - extensions_len - 5; + // Detect the v in the standard single-letter extensions. + if (memchr(isa, 'v', std_isa_len)) { + // The RVV implied the F extension. + flag |= kCpuHasRVV; + } + } + } +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is from x86 host running QEMU. + else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) || + (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) { + fclose(f); + return kCpuHasRVV; + } +#endif + } + fclose(f); + return flag; +} + LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return 0; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { // Workaround early kernel without MSA in ASEs line. if (strstr(cpuinfo_line, "Loongson-2K")) { @@ -191,14 +264,13 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { return flag; } -// TODO(fbarchard): Consider read_loongarch_ir(). 
#define LOONGARCH_CFG2 0x2 #define LOONGARCH_CFG2_LSX (1 << 6) #define LOONGARCH_CFG2_LASX (1 << 7) #if defined(__loongarch__) LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { - int flag = 0x0; + int flag = 0; uint32_t cfg2 = 0; __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2)); @@ -277,6 +349,10 @@ static SAFEBUFFERS int GetCpuFlags(void) { #endif cpu_info |= kCpuHasARM; #endif // __arm__ +#if defined(__riscv) && defined(__linux__) + cpu_info = RiscvCpuCaps("/proc/cpuinfo"); + cpu_info |= kCpuHasRISCV; +#endif // __riscv cpu_info |= kCpuInitialized; return cpu_info; } diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc index 4ccf00a3..0141da8a 100644 --- a/files/source/mjpeg_decoder.cc +++ b/files/source/mjpeg_decoder.cc @@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { } buf_.data = src; - buf_.len = static_cast(src_len); + buf_.len = (int)src_len; buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast(num_bytes); + size_t bytes = (size_t)num_bytes; if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index 169d4a8f..d115a2a1 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y, CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Copy plane for (y = 0; y < height; ++y) { @@ -162,7 +167,7 @@ void Convert8To16Plane(const uint8_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, - int scale, // 16384 for 10 bits + int scale, // 1024 for 10 bits int width, int height) { int y; @@ -333,6 +338,45 @@ int I210Copy(const uint16_t* src_y, return 0; } +// Copy I410. +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + // Copy I400. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -385,6 +429,7 @@ int I420ToI400(const uint8_t* src_y, } // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -418,6 +463,7 @@ int NV12Copy(const uint8_t* src_y, } // Copy NV21. Supports inverting. 
+LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, @@ -504,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. @@ -553,11 +604,19 @@ void MergeUVPlane(const uint8_t* src_u, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -582,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -687,7 +751,7 @@ void MergeUVPlane_16(const uint16_t* src_u, #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_16 = MergeUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { MergeUVRow_16 = MergeUVRow_16_AVX2; } } @@ -911,31 +975,31 @@ int NV21ToNV12(const uint8_t* src_y, return 0; } +// Test if tile_height is a power of 2 (16 or 32) +#define IS_POWEROFTWO(x) (!((x) & ((x)-1))) + // Detile a plane of data // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 // TODO: More detile row functions. 
- LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height) { +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { const ptrdiff_t src_tile_stride = 16 * tile_height; int y; void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) = DetileRow_C; - assert(src_stride_y >= 0); - assert(tile_height > 0); - assert(src_stride_y > 0); - - if (width <= 0 || height == 0) { - return; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -970,6 +1034,72 @@ void DetilePlane(const uint8_t* src_y, src_y = src_y - src_tile_stride + src_stride_y * tile_height; } } + return 0; +} + +// Convert a plane of 16 bit tiles of 16 x H to linear. +// tile width is 16 and assumed. +// tile_height is 16 or 32 for MT2T. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride, + uint16_t* dst, int width) = DetileRow_16_C; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + +#if defined(HAS_DETILEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow_16 = DetileRow_16_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_16_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + DetileRow_16 = DetileRow_16_Any_AVX; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_AVX; + } + } +#endif +#if defined(HAS_DETILEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileRow_16 = DetileRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow_16(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } + } + return 0; } LIBYUV_API @@ -1033,6 +1163,74 @@ void DetileSplitUVPlane(const uint8_t* src_uv, } } +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height) { + const ptrdiff_t src_y_tile_stride = 16 * tile_height; + const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2; + int y; + void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, int width) = DetileToYUY2_C; + assert(src_stride_y >= 0); + assert(src_stride_y > 0); + assert(src_stride_uv >= 0); + assert(src_stride_uv > 0); + assert(tile_height > 0); + + if (width <= 0 || height == 0 || tile_height <= 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + +#if defined(HAS_DETILETOYUY2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileToYUY2 = DetileToYUY2_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_NEON; + } + } +#endif + +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, + width); + dst_yuy2 += dst_stride_yuy2; + src_y += 16; + + if (y & 0x1) + src_uv += 16; + + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_y_tile_stride + src_stride_y * tile_height; + src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2); + } + } +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API @@ -1085,6 +1283,11 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitRGBRow = SplitRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -1144,6 +1347,11 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGERGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeRGBRow = MergeRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. 
@@ -1156,18 +1364,18 @@ void MergeRGBPlane(const uint8_t* src_r, } LIBYUV_NOINLINE -void SplitARGBPlaneAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { +static void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { int y; void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) = @@ -1175,6 +1383,9 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { width *= height; @@ -1215,6 +1426,11 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitARGBRow = SplitARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); @@ -1227,21 +1443,24 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } LIBYUV_NOINLINE -void SplitARGBPlaneOpaque(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +static void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int y; void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitXRGBRow_C; assert(height > 0); + 
if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; @@ -1281,6 +1500,11 @@ void SplitARGBPlaneOpaque(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitXRGBRow = SplitXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); @@ -1328,18 +1552,18 @@ void SplitARGBPlane(const uint8_t* src_argb, } LIBYUV_NOINLINE -void MergeARGBPlaneAlpha(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, @@ -1347,6 +1571,9 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; @@ -1378,6 +1605,11 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeARGBRow = MergeARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); @@ -1390,16 +1622,16 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } LIBYUV_NOINLINE -void MergeARGBPlaneOpaque(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - 
const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) = @@ -1407,6 +1639,9 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; @@ -1437,6 +1672,11 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeXRGBRow = MergeXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); @@ -1888,6 +2128,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1984,6 +2234,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUV422Row = UYVYToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUV422Row = UYVYToUV422Row_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) 
{ UYVYToYRow = UYVYToYRow_Any_LASX; @@ -2131,6 +2391,14 @@ int UYVYToY(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToYRow(src_uyvy, dst_y, width); @@ -2189,6 +2457,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -2255,6 +2531,14 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_MIRRORUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorUVRow = MirrorUVRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_LSX; + } + } +#endif #if defined(HAS_MIRRORUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorUVRow = MirrorUVRow_Any_LASX; @@ -2427,6 +2711,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -2809,6 +3101,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_LSX; + } + } +#endif #if defined(HAS_ARGBMULTIPLYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; @@ -2894,6 +3194,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAddRow = ARGBAddRow_Any_LSX; 
+ if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_LSX; + } + } +#endif #if defined(HAS_ARGBADDROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAddRow = ARGBAddRow_Any_LASX; @@ -2974,6 +3282,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_LSX; + } + } +#endif #if defined(HAS_ARGBSUBTRACTROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBSubtractRow = ARGBSubtractRow_Any_LASX; @@ -3051,6 +3367,11 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGB24Row = RAWToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -3060,6 +3381,7 @@ int RAWToRGB24(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Consider uint8_t value LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, @@ -3067,7 +3389,7 @@ void SetPlane(uint8_t* dst_y, int height, uint32_t value) { int y; - void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C; if (width <= 0 || height == 0) { return; @@ -3120,7 +3442,7 @@ void SetPlane(uint8_t* dst_y, // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); + SetRow(dst_y, (uint8_t)value, width); dst_y += dst_stride_y; } } @@ -3168,7 +3490,7 @@ int ARGBRect(uint8_t* dst_argb, int height, uint32_t value) { int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3293,6 +3615,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX; + 
if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_LSX; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; @@ -3301,6 +3631,11 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -3401,6 +3736,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3451,6 +3791,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3473,7 +3818,7 @@ int ARGBSepia(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3499,6 +3844,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_LSX; + } +#endif #if defined(HAS_ARGBSEPIAROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBSepiaRow = ARGBSepiaRow_LASX; @@ -3616,7 +3966,7 @@ int ARGBColorTable(uint8_t* dst_argb, 
int width, int height) { int y; - void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3652,7 +4002,7 @@ int RGBColorTable(uint8_t* dst_argb, int width, int height) { int y; - void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3697,7 +4047,7 @@ int ARGBQuantize(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || @@ -3924,6 +4274,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_LSX; + } +#endif #if defined(HAS_ARGBSHADEROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_LASX; @@ -3950,7 +4305,7 @@ int InterpolatePlane(const uint8_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4008,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0, 
} } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -4030,7 +4390,7 @@ int InterpolatePlane_16(const uint16_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow_16)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4213,6 +4573,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_LSX; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBShuffleRow = ARGBShuffleRow_Any_LASX; @@ -4444,6 +4812,11 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -4477,16 +4850,16 @@ static int ARGBSobelize(const uint8_t* src_argb, #endif { // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 31) & ~31; - align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + const int row_size = (width + kEdge + 31) & ~31; + align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge)); uint8_t* row_sobelx = rows; - uint8_t* row_sobely = rows + kRowSize; - uint8_t* row_y = rows + kRowSize * 2; + uint8_t* row_sobely = rows + row_size; + uint8_t* row_y = rows + row_size * 2; // Convert first row. 
uint8_t* row_y0 = row_y + kEdge; - uint8_t* row_y1 = row_y0 + kRowSize; - uint8_t* row_y2 = row_y1 + kRowSize; + uint8_t* row_y1 = row_y0 + row_size; + uint8_t* row_y2 = row_y1 + row_size; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -5027,9 +5400,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, return 0; } -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. - LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, @@ -5040,13 +5410,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2, int width, int height) { int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2, + uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -5057,109 +5424,91 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_SPLITUVROW_SSE2) +#if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif -#if defined(HAS_SPLITUVROW_AVX2) +#if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; } } 
#endif -#if defined(HAS_SPLITUVROW_NEON) +#if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; + YUY2ToYRow = YUY2ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; + YUY2ToYRow = YUY2ToYRow_NEON; } } #endif -#if defined(HAS_SPLITUVROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; + YUY2ToYRow = YUY2ToYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; + YUY2ToYRow = YUY2ToYRow_MSA; } } #endif -#if defined(HAS_SPLITUVROW_LSX) +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { - SplitUVRow = SplitUVRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_LSX; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; + YUY2ToYRow = YUY2ToYRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; + YUY2ToYRow = YUY2ToYRow_LSX; } } #endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; + YUY2ToYRow = YUY2ToYRow_LASX; } } #endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; + +#if defined(HAS_YUY2TONVUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; + YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; +#if defined(HAS_YUY2TONVUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { 
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; + YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2; } } #endif -#if defined(HAS_INTERPOLATEROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - InterpolateRow = InterpolateRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_LSX; +#if defined(HAS_YUY2TONVUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_NEON; } } #endif - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. 
- SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); + for (y = 0; y < height - 1; y += 2) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width); } return 0; } @@ -5177,7 +5526,7 @@ int UYVYToNV12(const uint8_t* src_uyvy, int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -5231,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -5271,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif { int awidth = halfwidth * 2; @@ -5336,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_AVX2; } #endif + for (y = 0; y < height - 1; y += 2) { // Merge a row of U and V into a row of UV. 
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); diff --git a/files/source/rotate.cc b/files/source/rotate.cc index f1e83cbd..8d3978c7 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -138,7 +138,7 @@ void RotatePlane180(const uint8_t* src, int dst_stride, int width, int height) { - // Swap first and last row and mirror the content. Uses a temporary row. + // Swap top and bottom row and mirror the content. Uses a temporary row. align_buffer_64(row, width); const uint8_t* src_bot = src + src_stride * (height - 1); uint8_t* dst_bot = dst + dst_stride * (height - 1); @@ -178,6 +178,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -206,12 +214,17 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy first row into buffer - MirrorRow(src_bot, dst, width); // Mirror last row into first row - MirrorRow(row, dst_bot, width); // Mirror buffer into last row + CopyRow(src, row, width); // Copy top row into buffer + MirrorRow(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row src += src_stride; dst += dst_stride; src_bot -= src_stride; @@ -476,6 +489,120 @@ int RotatePlane(const uint8_t* src, return -1; } +LIBYUV_API +void TransposePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i = height; + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8_16_C(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i); + } +} + +static void RotatePlane90_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane270_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. 
+ dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane180_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64_16(row, width); + const uint16_t* src_bot = src + src_stride * (height - 1); + uint16_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + CopyRow_16_C(src, row, width); // Copy top row into buffer + MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow_16_C(row, dst_bot, width); // Mirror buffer into bottom row + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64_16(row); +} + +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180_16(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + LIBYUV_API int I420Rotate(const uint8_t* src_y, int src_stride_y, @@ -544,6 +671,8 @@ int I420Rotate(const uint8_t* src_y, return -1; } +// I422 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. LIBYUV_API int I422Rotate(const uint8_t* src_y, int src_stride_y, @@ -579,31 +708,42 @@ int I422Rotate(const uint8_t* src_y, switch (mode) { case kRotate0: - // copy frame + // Copy frame CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. + case kRotate90: - // We need to rotate and rescale, we use plane Y as temporal storage. 
- RotatePlane90(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, halfheight, width, kFilterBilinear); - RotatePlane90(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, halfheight, width, kFilterLinear); RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; case kRotate270: - // We need to rotate and rescale, we use plane Y as temporal storage. - RotatePlane270(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, halfheight, width, kFilterBilinear); - RotatePlane270(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, halfheight, width, kFilterLinear); RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); @@ -828,6 +968,228 @@ int Android420ToI420Rotate(const uint8_t* src_y, return -1; } +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* 
dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +// I210 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. 
+LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // Copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. 
+ + case kRotate90: + RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + default: + break; + } + return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc index 539cf98d..c7239010 100644 --- a/files/source/rotate_argb.cc +++ b/files/source/rotate_argb.cc @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/rotate.h" +#include "libyuv/rotate_argb.h" #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ @@ -155,6 +156,14 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -183,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc index ff212ade..4b496d1b 100644 --- a/files/source/rotate_common.cc +++ b/files/source/rotate_common.cc @@ -94,12 +94,135 @@ void TransposeUVWxH_C(const uint8_t* src, for (i = 0; i < width * 2; i += 2) { int j; for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)]; + dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1]; } } } +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * 
src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst_a, + int dst_stride_a, + uint16_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + int i; + for (i = 0; i < width; i += 4) { + uint32_t p00 = ((uint32_t*)(src))[0]; + uint32_t p10 = ((uint32_t*)(src))[1]; + uint32_t p20 = ((uint32_t*)(src))[2]; + uint32_t p30 = ((uint32_t*)(src))[3]; + uint32_t p01 = ((uint32_t*)(src1))[0]; + uint32_t p11 = ((uint32_t*)(src1))[1]; + uint32_t p21 = ((uint32_t*)(src1))[2]; + uint32_t p31 = ((uint32_t*)(src1))[3]; + uint32_t p02 = ((uint32_t*)(src2))[0]; 
+ uint32_t p12 = ((uint32_t*)(src2))[1]; + uint32_t p22 = ((uint32_t*)(src2))[2]; + uint32_t p32 = ((uint32_t*)(src2))[3]; + uint32_t p03 = ((uint32_t*)(src3))[0]; + uint32_t p13 = ((uint32_t*)(src3))[1]; + uint32_t p23 = ((uint32_t*)(src3))[2]; + uint32_t p33 = ((uint32_t*)(src3))[3]; + ((uint32_t*)(dst))[0] = p00; + ((uint32_t*)(dst))[1] = p01; + ((uint32_t*)(dst))[2] = p02; + ((uint32_t*)(dst))[3] = p03; + ((uint32_t*)(dst1))[0] = p10; + ((uint32_t*)(dst1))[1] = p11; + ((uint32_t*)(dst1))[2] = p12; + ((uint32_t*)(dst1))[3] = p13; + ((uint32_t*)(dst2))[0] = p20; + ((uint32_t*)(dst2))[1] = p21; + ((uint32_t*)(dst2))[2] = p22; + ((uint32_t*)(dst2))[3] = p23; + ((uint32_t*)(dst3))[0] = p30; + ((uint32_t*)(dst3))[1] = p31; + ((uint32_t*)(dst3))[2] = p32; + ((uint32_t*)(dst3))[3] = p33; + src += src_stride * 4; // advance 4 rows + src1 += src_stride * 4; + src2 += src_stride * 4; + src3 += src_stride * 4; + dst += 4 * 4; // advance 4 columns + dst1 += 4 * 4; + dst2 += 4 * 4; + dst3 += 4 * 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc index 1a3f8cbb..fd5eee05 100644 --- a/files/source/rotate_gcc.cc +++ b/files/source/rotate_gcc.cc @@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src, "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_SSE2) +// 4 values, little endian view +// a b c d +// e f g h +// i j k l +// m n o p + +// transpose 2x2 +// a e b f from row 0, 1 +// i m j n from row 2, 3 +// c g d h from row 0, 1 +// k o l p from row 2, 3 + +// transpose 4x4 +// a e i m from row 0, 1 +// b f j n from row 0, 1 +// c g k o from row 2, 3 +// d h l p from row 2, 3 + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. 
+ "1: \n" + "movdqu (%0),%%xmm0 \n" // a b c d + "movdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "movdqu (%0),%%xmm2 \n" // i j k l + "movdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "movdqa %%xmm0,%%xmm6 \n" + "movdqa %%xmm2,%%xmm7 \n" + "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 + "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 + "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 + "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "movdqa %%xmm4,%%xmm0 \n" + "movdqa %%xmm4,%%xmm1 \n" + "movdqa %%xmm6,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 + "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 + "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 + "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 + + "movdqu %%xmm0,(%1) \n" + "lea 16(%1,%4),%1 \n" // dst += stride + 16 + "movdqu %%xmm1,-16(%1) \n" + "movdqu %%xmm2,-16(%1,%4) \n" + "movdqu %%xmm3,-16(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_AVX2) + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 2 blocks of 4x4. Read a column, write a row. 
+ "1: \n" + "vmovdqu (%0),%%xmm0 \n" // a b c d + "vmovdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vmovdqu (%0),%%xmm2 \n" // i j k l + "vmovdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l + "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1 + "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3 + "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1 + "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1 + "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1 + "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3 + "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3 + + "vmovdqu %%ymm0,(%1) \n" + "lea 32(%1,%4),%1 \n" // dst += stride + 32 + "vmovdqu %%ymm1,-32(%1) \n" + "vmovdqu %%ymm2,-32(%1,%4) \n" + "vmovdqu %%ymm3,-32(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_AVX2) + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc deleted file mode 100644 index f8de6083..00000000 --- a/files/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], 
%[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) 
\n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], 
%[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 
u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - 
"daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc index 844df2bf..569a7318 100644 --- a/files/source/rotate_neon.cc +++ b/files/source/rotate_neon.cc @@ -410,6 +410,46 @@ void TransposeUVWx8_NEON(const uint8_t* src, : "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" + "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" + "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" + "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n" + "subs %8, %8, #4 \n" // w -= 4 + "vst1.8 {q0}, [%4]! \n" + "vst1.8 {q1}, [%5]! \n" + "vst1.8 {q2}, [%6]! \n" + "vst1.8 {q3}, [%7]! 
\n" + "bgt 1b \n" + + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc index 43c15817..95047fa7 100644 --- a/files/source/rotate_neon64.cc +++ b/files/source/rotate_neon64.cc @@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -423,18 +423,57 @@ void TransposeUVWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride_a), // %6 + "r"((ptrdiff_t)dst_stride_b), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, 
+ int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" + "subs %w8, %w8, #4 \n" // w -= 4 + "st1 {v0.4s}, [%4], 16 \n" + "st1 {v1.4s}, [%5], 16 \n" + "st1 {v2.4s}, [%6], 16 \n" + "st1 {v3.4s}, [%7], 16 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/row_any.cc b/files/source/row_any.cc index 3781a9f2..e574543c 100644 --- a/files/source/row_any.cc +++ b/files/source/row_any.cc @@ -19,7 +19,7 @@ namespace libyuv { extern "C" { #endif -// memset for temp is meant to clear the source buffer (not dest) so that +// memset for vin is meant to clear the source buffer so that // SIMD that reads full multiple of 16 bytes will not trigger msan errors. // memset is not needed for production, as the garbage values are processed but // not used, although there may be edge cases for subsampling. 
@@ -35,20 +35,20 @@ extern "C" { void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_MERGEARGBROW_SSE2 @@ -68,25 +68,25 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 
192, a_buf + n, r); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I444ALPHATOARGBROW_SSSE3 @@ -113,6 +113,9 @@ ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) #ifdef HAS_I422ALPHATOARGBROW_MSA ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_LSX +ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15) +#endif #ifdef HAS_I422ALPHATOARGBROW_LASX ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) #endif @@ -123,21 +126,20 @@ ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, 
u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210ALPHATOARGBROW_SSSE3 @@ -190,20 +192,20 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2, #define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 4]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 4]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef 
HAS_MERGEAR64ROW_AVX2 @@ -237,22 +239,22 @@ ANY41PT(MergeARGB16To8Row_Any_NEON, #undef ANY41PT // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } // Merge functions. 
@@ -285,6 +287,9 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_LSX +ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOYUY2ROW_LASX ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) #endif @@ -294,6 +299,9 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_LSX +ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOUYVYROW_LASX ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #endif @@ -308,28 +316,27 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. // Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, 
DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 3]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ + vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422TOARGBROW_SSSE3 @@ -359,6 +366,9 @@ ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) #endif +#ifdef HAS_I444TORGB24ROW_SSSE3 +ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) +#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif @@ -374,6 +384,9 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif +#ifdef HAS_I444TORGB24ROW_AVX2 +ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31) +#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif @@ -383,6 +396,9 @@ ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #ifdef HAS_I422TORGB565ROW_AVX2 ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif +#ifdef HAS_I444TORGB24ROW_NEON 
+ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7) +#endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) @@ -401,6 +417,14 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_LSX +ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15) +ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15) +#endif #ifdef HAS_I422TOARGBROW_LASX ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) @@ -420,19 +444,19 @@ ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, 
y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210TOAR30ROW_SSSE3 @@ -477,19 +501,19 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 3]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 3]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_MERGEXR30ROW_AVX2 @@ -541,18 +565,19 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ 
if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Merge functions. @@ -560,7 +585,10 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX512BW +ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) #endif #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) @@ -611,18 +639,27 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_LSX +ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBMULTIPLYROW_LASX ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_LSX +ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBADDROW_LASX ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_LSX +ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif @@ 
-664,22 +701,53 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #endif #undef ANY21 +// Any 2 planes to 1 with stride +// width is measured in source pixels. 4 bytes contains 2 pixels +#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[32 * 2]); \ + SIMD_ALIGNED(uint8_t vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int awidth = (width + 1) / 2; \ + int r = awidth & MASK; \ + int n = awidth & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \ + } \ + memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \ + memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, 32, vout, MASK + 1); \ + memcpy(dst_uv + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_YUY2TONVUVROW_NEON +ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_SSE2 +ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_AVX2 +ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) +#endif + // Any 2 planes to 1 with yuvconstants #define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, 
MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Biplanar to RGB. @@ -758,21 +826,21 @@ ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) #undef ANY21C // Any 2 planes of 16 bit to 1 with yuvconstants -#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ - ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_P210TOAR30ROW_SSSE3 @@ -806,21 +874,22 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* 
dst_uv, int depth, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ } \ - memcpy(temp, src_u + n, r * BPP); \ - memcpy(temp + 16, src_v + n, r * BPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ - memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ + memcpy(vin, src_u + n, r * BPP); \ + memcpy(vin + 16, src_v + n, r * BPP); \ + ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \ + memcpy(dst_uv + n * 2, vout, r * BPP * 2); \ } #ifdef HAS_MERGEUVROW_16_AVX2 -ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) +ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7) #endif #ifdef HAS_MERGEUVROW_16_NEON ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) @@ -829,18 +898,19 @@ ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 1. 
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_COPYROW_AVX @@ -931,6 +1001,13 @@ ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) +ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7) +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) @@ -959,6 +1036,9 @@ ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif +#ifdef 
HAS_ABGRTOYJROW_AVX2 +ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYJROW_AVX2 ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -983,6 +1063,9 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_SSSE3 +ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_SSSE3 ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif @@ -992,12 +1075,18 @@ ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_LSX +ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYROW_LASX ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_NEON +ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_NEON ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif @@ -1007,9 +1096,21 @@ ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_LSX ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYJROW_LSX +ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_LSX +ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_LASX +ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_LASX ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYJROW_LASX +ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1019,6 +1120,9 @@ ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 
1, 15) #ifdef HAS_BGRATOYROW_LSX ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_LASX +ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif @@ -1028,6 +1132,9 @@ ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #ifdef HAS_ABGRTOYROW_LSX ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYROW_LASX +ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1037,6 +1144,9 @@ ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #ifdef HAS_RGBATOYROW_LSX ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_LASX +ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif @@ -1055,6 +1165,12 @@ ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #ifdef HAS_RGB24TOYROW_LSX ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_LASX ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif @@ -1079,6 +1195,12 @@ ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #ifdef HAS_RAWTOYROW_LASX ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif +#ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1115,12 +1237,18 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA 
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_LSX +ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOYROW_LASX ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_LSX +ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_UYVYTOYROW_LASX ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #endif @@ -1217,6 +1345,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_LSX +ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif @@ -1238,19 +1369,21 @@ ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. 
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(vout, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 @@ -1270,16 +1403,17 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. 
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #if defined(HAS_I400TOARGBROW_SSE2) @@ -1355,6 +1489,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) +ANY11P(ARGBToRGB565DitherRow_Any_LSX, + ARGBToRGB565DitherRow_LSX, + const uint32_t, + 4, + 2, + 7) +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) ANY11P(ARGBToRGB565DitherRow_Any_LASX, ARGBToRGB565DitherRow_LASX, @@ -1375,6 +1517,9 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_LSX +ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7) +#endif #ifdef HAS_ARGBSHUFFLEROW_LASX ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) #endif @@ -1384,17 +1529,17 @@ ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) // Any 1 to 1 with type #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ - SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ - memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[(MASK + 1) 
* SBPP]); \ + SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ - memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ - ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ - memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBTOAR64ROW_SSSE3 @@ -1450,17 +1595,17 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ - SIMD_ALIGNED(STYPE temp[32]); \ - SIMD_ALIGNED(DTYPE out[32]); \ - memset(temp, 0, 32 * SBPP); /* for msan */ \ + SIMD_ALIGNED(STYPE vin[32]); \ + SIMD_ALIGNED(DTYPE vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, scale, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, scale, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_CONVERT16TO8ROW_SSSE3 @@ -1537,17 +1682,17 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ - SIMD_ALIGNED(ST temp[32]); \ - SIMD_ALIGNED(T out[32]); \ - memset(temp, 0, SBPP * 32); /* for msan */ \ + SIMD_ALIGNED(ST vin[32]); \ + SIMD_ALIGNED(T vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, param, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 @@ -1588,20 +1733,22 @@ ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) #undef ANY11P16 // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, 
yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } + #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -1628,21 +1775,21 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) #define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -1682,21 +1829,21 @@ ANY11I(InterpolateRow_16_Any_NEON, #define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int scale, int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, 
scale, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_16TO8_NEON @@ -1721,18 +1868,19 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, #undef ANY11IS // Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r* BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr, r* BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } #ifdef HAS_MIRRORROW_AVX2 @@ -1747,6 +1895,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_LSX +ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31) +#endif #ifdef HAS_MIRRORROW_LASX 
ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #endif @@ -1762,6 +1913,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) #endif +#ifdef HAS_MIRRORUVROW_LSX +ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7) +#endif #ifdef HAS_MIRRORUVROW_LASX ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #endif @@ -1777,6 +1931,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_LSX +ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) +#endif #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif @@ -1791,15 +1948,14 @@ ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) // Any 1 plane. (memset) #define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8_t temp[64]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vout[64]); \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, v32, n); \ } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + ANY_SIMD(vout, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_SETROW_X86 @@ -1823,20 +1979,21 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) #undef ANY1 // Any 1 to 2. Outputs UV planes. 
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(vin, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \ } #ifdef HAS_SPLITUVROW_SSE2 @@ -1875,6 +2032,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_LSX +ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOUV422ROW_LASX ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) @@ -1885,17 +2047,18 @@ ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31) // Any 2 16 bit 
planes with parameter to 1 #define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ } \ - memcpy(temp, src_uv + n * 2, r * BPP * 2); \ - ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ - memcpy(dst_u + n, temp + 32, r * BPP); \ - memcpy(dst_v + n, temp + 48, r * BPP); \ + memcpy(vin, src_uv + n * 2, r * BPP * 2); \ + ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \ + memcpy(dst_u + n, vout, r * BPP); \ + memcpy(dst_v + n, vout + 16, r * BPP); \ } #ifdef HAS_SPLITUVROW_16_AVX2 @@ -1909,21 +2072,22 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 3. Outputs RGB planes. 
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[16 * 3]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ } #ifdef HAS_SPLITRGBROW_SSSE3 @@ -1946,23 +2110,23 @@ ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif // Any 1 to 4. Outputs ARGB planes. 
-#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, uint8_t* dst_a, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 8]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ - MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 4, r); \ - memcpy(dst_g + n, temp + 16 * 5, r); \ - memcpy(dst_b + n, temp + 16 * 6, r); \ - memcpy(dst_a + n, temp + 16 * 7, r); \ +#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, uint8_t* dst_a, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[16 * 4]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ + memcpy(dst_a + n, vout + 48, r); \ } #ifdef HAS_SPLITARGBROW_SSE2 @@ -1983,25 +2147,26 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_ptr + (n 
>> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ } #ifdef HAS_ARGBTOUVROW_AVX2 @@ -2013,9 +2178,17 @@ ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVJROW_AVX2 +ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_SSSE3 +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_SSSE3 +ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) @@ -2034,12 +2207,18 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 
0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_LSX +ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_LASX ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVJROW_NEON +ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2142,12 +2321,18 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_LSX +ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_YUY2TOUVROW_LASX ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_LSX +ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_LASX ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #endif @@ -2158,24 +2343,25 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { 
/* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ - memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ + ANY_SIMD(vin, 128, vout, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \ } #ifdef HAS_AYUVTOVUROW_NEON @@ -2184,42 +2370,53 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S -#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src, src_tile_stride, dst, n); \ - } \ - memcpy(temp, src + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ - memcpy(dst + n, temp + 16, r); \ +#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \ + SIMD_ALIGNED(T vin[16]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \ + ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \ + memcpy(dst + n, vout, r * BPP); \ } #ifdef HAS_DETILEROW_NEON -ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_DETILEROW_SSE2 -ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 
uint8_t, 1, 15) +#endif +#ifdef HAS_DETILEROW_16_NEON +ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_SSE2 +ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_AVX +ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #endif +// DetileSplitUVRow width is in bytes #define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[16]); \ + SIMD_ALIGNED(uint8_t vout[8 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ - memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ - memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \ + memcpy(dst_u + n / 2, vout, (r + 1) / 2); \ + memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \ } #ifdef HAS_DETILESPLITUVROW_NEON @@ -2229,6 +2426,33 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) #endif +#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \ + uint8_t* dst_yuy2, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \ 
+ n); \ + } \ + memcpy(vin, src_y + (n / 16) * src_y_tile_stride, r); \ + memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \ + ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \ + memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \ + } + +#ifdef HAS_DETILETOYUY2_NEON +ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) +#endif + +#ifdef HAS_DETILETOYUY2_SSE2 +ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_common.cc b/files/source/row_common.cc index 83442496..8be37fb5 100644 --- a/files/source/row_common.cc +++ b/files/source/row_common.cc @@ -21,6 +21,12 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + // This macro controls YUV to RGB using unsigned math to extend range of // YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: // LIBYUV_UNLIMITED_DATA @@ -182,12 +188,13 @@ void RGB565ToARGBRow_C(const uint8_t* src_rgb565, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 2) | (g >> 4); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = 255u; dst_argb += 4; src_rgb565 += 2; @@ -199,13 +206,14 @@ void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, int width) { int x; for (x = 0; x < width; ++x) { - 
uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - uint8_t a = src_argb1555[1] >> 7; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 3) | (g >> 2); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = -a; dst_argb += 4; src_argb1555 += 2; @@ -217,14 +225,14 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - uint8_t a = src_argb4444[1] >> 4; - dst_argb[0] = (b << 4) | b; - dst_argb[1] = (g << 4) | g; - dst_argb[2] = (r << 4) | r; - dst_argb[3] = (a << 4) | a; + uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f); + uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4); + uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f); + uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r); + dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a); dst_argb += 4; src_argb4444 += 2; } @@ -320,7 +328,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t b0 = src_argb[0] >> 3; uint8_t g0 = src_argb[1] >> 2; uint8_t r0 = src_argb[2] >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | 
(r0 << 11)); } } @@ -334,29 +342,31 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { // or the upper byte for big endian. void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3); + uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2); + uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = 
STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -371,8 +381,10 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 3; uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15)); dst_rgb += 4; src_argb += 8; } @@ -381,7 +393,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 3; uint8_t r0 = src_argb[2] >> 3; uint8_t a0 = src_argb[3] >> 7; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); } } @@ -396,8 +409,10 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12)); dst_rgb += 4; src_argb += 8; } @@ -406,18 +421,20 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 4; uint8_t r0 = src_argb[2] >> 4; uint8_t a0 = src_argb[3] >> 4; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); } } void 
ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); - uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); uint32_t a0 = (src_abgr[3] >> 6); - *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_abgr += 4; } @@ -430,7 +447,8 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); uint32_t a0 = (src_argb[3] >> 6); - *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_argb += 4; } @@ -439,10 +457,14 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { int x; for (x = 0; x < width; ++x) { - dst_ar64[0] = src_argb[0] * 0x0101; - dst_ar64[1] = src_argb[1] * 0x0101; - dst_ar64[2] = src_argb[2] * 0x0101; - dst_ar64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ar64[0] = b; + dst_ar64[1] = g; + dst_ar64[2] = r; + dst_ar64[3] = a; dst_ar64 += 4; src_argb += 4; } @@ -451,10 +473,14 @@ void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { int x; for (x = 0; x < width; ++x) { - 
dst_ab64[0] = src_argb[2] * 0x0101; - dst_ab64[1] = src_argb[1] * 0x0101; - dst_ab64[2] = src_argb[0] * 0x0101; - dst_ab64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ab64[0] = r; + dst_ab64[1] = g; + dst_ab64[2] = b; + dst_ab64[3] = a; dst_ab64 += 4; src_argb += 4; } @@ -463,10 +489,14 @@ void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ar64[0] >> 8; - dst_argb[1] = src_ar64[1] >> 8; - dst_argb[2] = src_ar64[2] >> 8; - dst_argb[3] = src_ar64[3] >> 8; + uint8_t b = src_ar64[0] >> 8; + uint8_t g = src_ar64[1] >> 8; + uint8_t r = src_ar64[2] >> 8; + uint8_t a = src_ar64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ar64 += 4; } @@ -475,10 +505,14 @@ void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ab64[2] >> 8; - dst_argb[1] = src_ab64[1] >> 8; - dst_argb[2] = src_ab64[0] >> 8; - dst_argb[3] = src_ab64[3] >> 8; + uint8_t r = src_ab64[0] >> 8; + uint8_t g = src_ab64[1] >> 8; + uint8_t b = src_ab64[2] >> 8; + uint8_t a = src_ab64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ab64 += 4; } @@ -514,8 +548,8 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. 
-static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return ((33 * r + 65 * g + 13 * b) >> 7) + 16; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); } #else // 8 bit @@ -524,8 +558,8 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + // 0x7e80) >> 8; -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); } #endif @@ -533,29 +567,31 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. #ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); } #else // TODO(fbarchard): Add rounding to x86 SIMD and use this -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); } #endif // 
LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. #if !defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); } static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); } #endif @@ -674,28 +710,28 @@ MAKEROWY(RAW, 0, 1, 2, 3) #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } #else // 8 bit -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (77 * r + 150 * g + 29 * b + 128) >> 8; } #endif #if defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #else -static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; } -static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; } #endif @@ -782,6 +818,7 @@ static __inline int RGB2xToVJ(uint16_t r, 
uint16_t g, uint16_t b) { #endif MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGB24, 2, 1, 0, 3) MAKEROWYJ(RAW, 0, 1, 2, 3) @@ -791,11 +828,12 @@ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); uint8_t r = src_rgb565[1] >> 3; - b = (b << 3) | (b >> 2); - g = (g << 2) | (g >> 4); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_rgb565 += 2; dst_y += 1; @@ -806,11 +844,12 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_argb1555 += 2; dst_y += 1; @@ -823,9 +862,9 @@ void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { uint8_t b = src_argb4444[0] & 0x0f; uint8_t g = src_argb4444[0] >> 4; uint8_t r = src_argb4444[1] & 0x0f; - b = (b << 4) | b; - g = (g << 4) | g; - r = (r << 4) | r; + b = STATIC_CAST(uint8_t, (b << 4) | b); + g = STATIC_CAST(uint8_t, (g << 4) | g); + r = STATIC_CAST(uint8_t, (r << 4) | r); dst_y[0] = RGBToY(r, g, b); src_argb4444 += 2; dst_y += 1; @@ -840,31 +879,35 @@ void RGB565ToUVRow_C(const 
uint8_t* src_rgb565, const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b1 = src_rgb565[2] & 0x1f; - uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8_t r1 = src_rgb565[3] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b3 = next_rgb565[2] & 0x1f; - uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8_t r3 = next_rgb565[3] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 2) | (g1 >> 4); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 2) | (g3 >> 4); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) 
| (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -886,19 +929,20 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_v += 1; } if (width & 1) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -924,31 +968,35 @@ void 
ARGB1555ToUVRow_C(const uint8_t* src_argb1555, const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b1 = src_argb1555[2] & 0x1f; - uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8_t b3 = next_argb1555[2] & 0x1f; - uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 3) | (g1 >> 2); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 3) | (g3 >> 2); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) 
<< 3)); + uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -970,19 +1018,21 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_v += 1; } if (width & 1) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); 
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1021,18 +1071,18 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b1 = (b1 << 4) | b1; - g1 = (g1 << 4) | g1; - r1 = (r1 << 4) | r1; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - b3 = (b3 << 4) | b3; - g3 = (g3 << 4) | g3; - r3 = (r3 << 4) | r3; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1); + g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1); + r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); + b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3); + g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3); + r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -1061,12 +1111,12 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1123,9 +1173,9 @@ void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int sg = (b * 22 + g * 88 + r * 45) >> 7; int sr = (b * 24 + g * 
98 + r * 50) >> 7; // b does not over flow. a is preserved from original. - dst_argb[0] = sb; - dst_argb[1] = clamp255(sg); - dst_argb[2] = clamp255(sr); + dst_argb[0] = STATIC_CAST(uint8_t, sb); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr)); dst_argb += 4; } } @@ -1154,10 +1204,10 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; - dst_argb[0] = Clamp(sb); - dst_argb[1] = Clamp(sg); - dst_argb[2] = Clamp(sr); - dst_argb[3] = Clamp(sa); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb)); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr)); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa)); src_argb += 4; dst_argb += 4; } @@ -1207,9 +1257,12 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb, int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; - dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; - dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; - dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb[0] = STATIC_CAST( + uint8_t, (b * scale >> 16) * interval_size + interval_offset); + dst_argb[1] = STATIC_CAST( + uint8_t, (g * scale >> 16) * interval_size + interval_offset); + dst_argb[2] = STATIC_CAST( + uint8_t, (r * scale >> 16) * interval_size + interval_offset); dst_argb += 4; } } @@ -1260,10 +1313,10 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; const uint32_t a_scale = src_argb1[3]; - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale)); + dst_argb[3] = 
STATIC_CAST(uint8_t, SHADE(a, a_scale)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1288,10 +1341,10 @@ void ARGBAddRow_C(const uint8_t* src_argb, const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; const int a_add = src_argb1[3]; - dst_argb[0] = SHADE(b, b_add); - dst_argb[1] = SHADE(g, g_add); - dst_argb[2] = SHADE(r, r_add); - dst_argb[3] = SHADE(a, a_add); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1315,10 +1368,10 @@ void ARGBSubtractRow_C(const uint8_t* src_argb, const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; const int a_sub = src_argb1[3]; - dst_argb[0] = SHADE(b, b_sub); - dst_argb[1] = SHADE(g, g_sub); - dst_argb[2] = SHADE(r, r_sub); - dst_argb[3] = SHADE(a, a_sub); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1431,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // clang-format off -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) // Bias values include subtract 128 from U and V, bias from Y and rounding. // For B and R bias is negative. For G bias is positive. 
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ @@ -1627,7 +1680,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef MAKEYUVCONSTANTS -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) #define LOAD_YUV_CONSTANTS \ int ub = yuvconstants->kUVCoeff[0]; \ int vr = yuvconstants->kUVCoeff[1]; \ @@ -1675,9 +1728,9 @@ static __inline void YuvPixel(uint8_t y, LOAD_YUV_CONSTANTS; uint32_t y32 = y * 0x0101; CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // Reads 8 bit YUV and leaves result as 16 bit. @@ -1706,9 +1759,9 @@ static __inline void YuvPixel10_16(uint16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 6; - u = clamp255(u >> 2); - v = clamp255(v >> 2); + uint32_t y32 = (y << 6) | (y >> 4); + u = STATIC_CAST(uint8_t, clamp255(u >> 2)); + v = STATIC_CAST(uint8_t, clamp255(v >> 2)); CALC_RGB16; *b = b16; *g = g16; @@ -1725,9 +1778,9 @@ static __inline void YuvPixel12_16(int16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 4; - u = clamp255(u >> 4); - v = clamp255(v >> 4); + uint32_t y32 = (y << 4) | (y >> 8); + u = STATIC_CAST(uint8_t, clamp255(u >> 4)); + v = STATIC_CAST(uint8_t, clamp255(v >> 4)); CALC_RGB16; *b = b16; *g = g16; @@ -1747,9 +1800,9 @@ static __inline void YuvPixel10(uint16_t y, int g16; int r16; YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 12 bit assembly. 
@@ -1765,9 +1818,9 @@ static __inline void YuvPixel12(uint16_t y, int g16; int r16; YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 16 bit assembly. @@ -1781,12 +1834,12 @@ static __inline void YuvPixel16_8(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // C reference code that mimics the YUV 16 bit assembly. 
@@ -1800,8 +1853,8 @@ static __inline void YuvPixel16_16(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; *b = b16; *g = g16; @@ -1815,7 +1868,7 @@ static __inline void YPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) int yg = yuvconstants->kRGBCoeffBias[0]; int ygb = yuvconstants->kRGBCoeffBias[4]; #else @@ -1823,9 +1876,9 @@ static __inline void YPixel(uint8_t y, int yg = yuvconstants->kYToRgb[0]; #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp(((int32_t)(y1) + ygb) >> 6); - *g = Clamp(((int32_t)(y1) + ygb) >> 6); - *r = Clamp(((int32_t)(y1) + ygb) >> 6); + *b = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); } void I444ToARGBRow_C(const uint8_t* src_y, @@ -1846,6 +1899,23 @@ void I444ToARGBRow_C(const uint8_t* src_y, } } +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 3; // Advance 1 pixel. 
+ } +} + // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -1929,10 +1999,10 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width - 1; x += 2) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = clamp255(src_a[1] >> 2); + rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2)); src_y += 2; src_u += 1; src_v += 1; @@ -1942,7 +2012,7 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, if (width & 1) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); } } @@ -1957,7 +2027,7 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width; ++x) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); src_y += 1; src_u += 1; src_v += 1; @@ -2283,8 +2353,10 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; - *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000); + *(uint16_t*)(dst_argb4444 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000); src_y += 2; src_u += 1; src_v += 1; @@ -2295,7 +2367,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 
<< 8) | 0xf000); } } @@ -2321,8 +2394,10 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; - *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); + *(uint16_t*)(dst_argb1555 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000); src_y += 2; src_u += 1; src_v += 1; @@ -2333,7 +2408,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); } } @@ -2359,8 +2435,10 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb565 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); src_y += 2; src_u += 1; src_v += 1; @@ -2371,7 +2449,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -2486,8 +2565,12 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); + *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) | + STATIC_CAST(uint16_t, g1 << 5) | + STATIC_CAST(uint16_t, r1 << 11); 
src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -2497,7 +2580,9 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); } } @@ -2603,6 +2688,19 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { int x; src_uv += (width - 1) << 1; @@ -2714,6 +2812,21 @@ void DetileRow_C(const uint8_t* src, } } +void DetileRow_16_C(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16 * sizeof(uint16_t)); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, (width & 15) * sizeof(uint16_t)); + } +} + void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -2731,6 +2844,51 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, } } +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + for (int x = 0; x < width - 15; x += 16) { + for (int i = 0; i < 8; i++) { + dst_yuy2[0] = src_y[0]; + dst_yuy2[1] = src_uv[0]; + dst_yuy2[2] = src_y[1]; + dst_yuy2[3] = src_uv[1]; + dst_yuy2 += 4; + src_y += 2; + src_uv += 2; + } + src_y += src_y_tile_stride - 16; + src_uv += src_uv_tile_stride - 16; + } +} + +// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded +// in 80 byte blocks representing 64 pixels each. 
The first 16 bytes of the +// block contain all of the lower 2 bits of each pixel packed together, and the +// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are +// packed into 1x4 blocks, whereas the upper bits are packed in normal raster +// order. +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) { + for (size_t i = 0; i < size; i += 80) { + const uint8_t* src_lower_bits = src; + const uint8_t* src_upper_bits = src + 16; + + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 16; k++) { + *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 | + (uint16_t)*src_upper_bits << 8 | + (uint16_t)*src_upper_bits >> 2; + src_upper_bits++; + } + } + + src += 80; + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -2823,10 +2981,10 @@ void MergeAR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); + dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift); dst_ar64 += 4; } } @@ -2843,10 +3001,10 @@ void MergeARGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); + dst_argb[3] = 
STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift)); dst_argb += 4; } } @@ -2863,9 +3021,9 @@ void MergeXR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); dst_ar64[3] = 0xffff; dst_ar64 += 4; } @@ -2882,9 +3040,9 @@ void MergeXRGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); dst_argb[3] = 0xff; dst_argb += 4; } @@ -2930,8 +3088,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, assert(depth <= 16); int x; for (x = 0; x < width; ++x) { - dst_uv[0] = src_u[x] << shift; - dst_uv[1] = src_v[x] << shift; + dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift); + dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift); dst_uv += 2; } } @@ -2959,7 +3117,7 @@ void MultiplyRow_16_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - dst_y[x] = src_y[x] * scale; + dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale); } } @@ -2990,7 +3148,7 @@ void Convert16To8Row_C(const uint16_t* src_y, assert(scale <= 32768); for (x = 0; x < width; ++x) { - dst_y[x] = C16TO8(src_y[x], scale); + dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale)); } } @@ -3043,6 +3201,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, } } +// Filter 2 rows of YUY2 UV's (422) into UV 
(NV12). +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_uv += 2; + } +} + // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, @@ -3138,9 +3311,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; fb = src_argb[4 + 0]; @@ -3150,9 +3323,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; - dst_argb[4 + 0] = BLEND(fb, bb, a); - dst_argb[4 + 1] = BLEND(fg, bg, a); - dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[4 + 3] = 255u; src_argb += 8; src_argb1 += 8; @@ -3167,9 +3340,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; } } @@ -3214,7 +3387,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int 
width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); b = src_argb[4]; g = src_argb[5]; r = src_argb[6]; @@ -3222,7 +3395,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[4] = ATTENUATE(b, a); dst_argb[5] = ATTENUATE(g, a); dst_argb[6] = ATTENUATE(r, a); - dst_argb[7] = a; + dst_argb[7] = STATIC_CAST(uint8_t, a); src_argb += 8; dst_argb += 8; } @@ -3235,7 +3408,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); } } #undef ATTENUATE @@ -3307,10 +3480,10 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb, const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point // Clamping should not be necessary but is free in assembly. - dst_argb[0] = UNATTENUATE(b, ia); - dst_argb[1] = UNATTENUATE(g, ia); - dst_argb[2] = UNATTENUATE(r, ia); - dst_argb[3] = a; + dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia)); + dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia)); + dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia)); + dst_argb[3] = STATIC_CAST(uint8_t, a); src_argb += 4; dst_argb += 4; } @@ -3344,12 +3517,20 @@ void CumulativeSumToAverageRow_C(const int32_t* tl, int i; assert(area != 0); - ooa = 1.0f / area; + ooa = 1.0f / STATIC_CAST(float, area); for (i = 0; i < count; ++i) { - dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = + (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * + ooa); + dst[1] = + (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * + 
ooa); + dst[2] = + (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * + ooa); + dst[3] = + (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * + ooa); dst += 4; tl += 4; bl += 4; @@ -3407,7 +3588,9 @@ static void HalfRow_16To8_C(const uint16_t* src_uv, int width) { int x; for (x = 0; x < width; ++x) { - dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale); + dst_uv[x] = STATIC_CAST( + uint8_t, + C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale)); } } @@ -3433,8 +3616,9 @@ void InterpolateRow_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint8_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3463,8 +3647,9 @@ void InterpolateRow_16_C(uint16_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint16_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3501,9 +3686,11 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = C16TO8( - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, - scale); + dst_ptr[0] = STATIC_CAST( + uint8_t, + C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale)); src_ptr += 1; src_ptr1 += 1; dst_ptr += 1; @@ -3615,10 +3802,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32_t)(db)); - dst_argb[1] = Clamp((int32_t)(dg)); - dst_argb[2] = Clamp((int32_t)(dr)); - dst_argb[3] = Clamp((int32_t)(da)); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db))); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg))); + dst_argb[2] = STATIC_CAST(uint8_t, 
Clamp((int32_t)(dr))); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da))); src_argb += 4; dst_argb += 4; } @@ -4023,6 +4210,32 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif +#if defined(HAS_I444TORGB24ROW_AVX2) +void I444ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth; + src_v += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_NV12TORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, @@ -4164,8 +4377,9 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) { void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { int i; for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + *dst++ = STATIC_CAST( + uint16_t, + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8); ++src; } } @@ -4325,6 +4539,8 @@ void HalfMergeUVRow_C(const uint8_t* src_u, } } +#undef STATIC_CAST + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc index dce8c439..e94fd04d 100644 --- a/files/source/row_gcc.cc +++ b/files/source/row_gcc.cc @@ -27,6 +27,9 @@ static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; +static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 
0u, + 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; + static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -39,12 +42,18 @@ static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}; +static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, + -43, -84, 127, 0, -43, -84, 127, 0}; + static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}; static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0}; +static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, + 127, -107, -20, 0, 127, -107, -20, 0}; + // Constants for BGRA static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; @@ -729,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" @@ -777,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" @@ -1201,6 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 @@ -1228,6 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 @@ -1256,6 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, 
"lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1284,6 +1296,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1398,6 +1411,24 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ABGRTOYJROW_SSSE3 +// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. +// Same as ABGRToYRow but different coefficients, no add 16. +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYJROW_SSSE3 + #ifdef HAS_RGBATOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. @@ -1416,7 +1447,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_SSSE3 -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. 
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif @@ -1429,9 +1461,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1451,9 +1482,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1472,9 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm5) + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1486,15 +1515,32 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 +#ifdef HAS_ABGRTOYJROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYJROW_AVX2 + #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2( + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1571,11 +1617,15 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVROW_SSSE3 -#ifdef HAS_ARGBTOUVROW_AVX2 +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1765,6 +1815,71 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 +// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix +#ifdef HAS_ABGRTOUVJROW_AVX2 +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw 
%%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kSub128), // %5 + "m"(kABGRToVJ), // %6 + "m"(kABGRToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1831,6 +1946,72 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ABGRTOUVJROW_SSSE3 +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + 
"shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToVJ), // %5 + "m"(kABGRToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + #ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, @@ -2153,9 +2334,6 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422 10 bit, upsample to 8 UV -// TODO(fbarchard): Consider shufb to replace pack/unpack -// TODO(fbarchard): Consider pmulhuw to replace psraw -// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. 
#define READYUV210 \ "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ @@ -2165,7 +2343,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" #define READYUVA210 \ @@ -2177,7 +2358,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2196,7 +2380,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 444 10 bit. With 8 Alpha. @@ -2211,7 +2398,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2228,7 +2418,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x4,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $4,%%xmm4 \n" \ + "psrlw $8,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
@@ -2399,6 +2592,20 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" +// Store 8 RGB24 values. +#define STORERGB24 \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "pshufb %%xmm5,%%xmm0 \n" \ + "pshufb %%xmm6,%%xmm1 \n" \ + "palignr $0xc,%%xmm0,%%xmm1 \n" \ + "movq %%xmm0,(%[dst_rgb24]) \n" \ + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \ + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + // Store 8 AR30 values. #define STOREAR30 \ "psraw $0x4,%%xmm0 \n" \ @@ -2508,17 +2715,43 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + STORERGB24 + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa 
%[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STORERGB24 "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -3209,7 +3442,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. @@ -3224,7 +3459,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -3242,7 +3479,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 212 12 bit, upsample to 16 UV @@ -3257,7 +3496,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ + "vpsllw $4,%%ymm4,%%ymm2 \n" \ + "vpsrlw $8,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 16 UV from 410. With 16 Alpha. 
@@ -3271,7 +3512,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -4785,6 +5028,84 @@ void DetileRow_SSE2(const uint8_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILEROW_16_SSE2 +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILEROW_16_AVX +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea (%0,%3,2),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_AVX + +#ifdef HAS_DETILETOYUY2_SSE2 +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" + "lea (%0,%4),%0 \n" + "movdqu (%1),%%xmm1 \n" // Load 8 UV + "lea (%1,%5),%1 \n" + "movdqu %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif + #ifdef HAS_DETILESPLITUVROW_SSSE3 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit @@ -4821,36 +5142,59 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, } #endif // HAS_DETILESPLITUVROW_SSSE3 +#ifdef HAS_MERGEUVROW_AVX512BW +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX512BW + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile("sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw 
%%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -4859,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile("sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -4876,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -4891,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - depth = 16 - depth; // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" + "vmovd %5,%%xmm4 \n" + + "sub %0,%1 \n" + // 8 pixels per loop. - // 16 pixels per loop. 
- LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - + "vpmovzxwd (%0),%%ymm0 \n" + "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" + "vpslld %%xmm4,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(depth) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 @@ -5127,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, // 512 = 9 bits // 1024 = 10 bits // 4096 = 12 bits -// TODO(fbarchard): reduce to SSE2 void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, @@ -6178,6 +6517,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6461,6 +6801,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 
0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, @@ -6661,6 +7028,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1"); +} + void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, diff --git a/files/source/row_lasx.cc b/files/source/row_lasx.cc index 7dd18f40..1082ad80 100644 --- a/files/source/row_lasx.cc +++ b/files/source/row_lasx.cc @@ -775,40 +775,6 @@ void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, } } -void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3; - __m256i tmp0, tmp1, dst0; - __m256i const_19 = __lasx_xvldi(0x19); - __m256i const_42 = __lasx_xvldi(0x42); - __m256i const_81 = __lasx_xvldi(0x81); - __m256i const_1080 = {0x1080108010801080, 
0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, - src_argb0, 96, src0, src1, src2, src3); - vec0 = __lasx_xvpickev_b(src1, src0); - vec1 = __lasx_xvpickev_b(src3, src2); - vec2 = __lasx_xvpickod_b(src1, src0); - vec3 = __lasx_xvpickod_b(src3, src2); - tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19); - tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19); - tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81); - tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81); - tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42); - tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42); - dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8); - dst0 = __lasx_xvperm_w(dst0, control); - __lasx_xvst(dst0, dst_y, 0); - src_argb0 += 128; - dst_y += 32; - } -} - void ARGBToUVRow_LASX(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, @@ -1216,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = width / 16; @@ -1811,48 +1777,6 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x4219421942194219, 0x4219421942194219, - 0x4219421942194219, 0x4219421942194219}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 
0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_rgb24, 0); - reg1 = __lasx_xvld(src_rgb24, 32); - reg2 = __lasx_xvld(src_rgb24, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_rgb24 += 96; - } -} - void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -1916,48 +1840,6 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, } } -void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x1942194219421942, 0x1942194219421942, - 0x1942194219421942, 0x1942194219421942}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = 
{0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_raw, 0); - reg1 = __lasx_xvld(src_raw, 32); - reg2 = __lasx_xvld(src_raw, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_raw += 96; - } -} - void RAWToUVRow_LASX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -2118,36 +2000,228 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, dst0; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1; - __m256i const_128 = __lasx_xvldi(0x480); - __m256i const_150 = __lasx_xvldi(0x96); - __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, - 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 
+// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp2 = __lasx_xvpickev_b(src3, src2); - tmp3 = __lasx_xvpickod_b(src3, src2); - reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lasx_xvpickod_b(reg1, reg0); - dst0 = __lasx_xvperm_w(dst0, shuff); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_argb += 128; - } +void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, 
&kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G + "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + 
RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr4, %4, 0 \n\t" // load shuff + "xvld $xr5, %4, 32 \n\t" + "xvld $xr6, %4, 64 \n\t" + "xvld $xr7, %4, 96 \n\t" + "1: \n\t" + "xvld $xr8, %0, 0 \n\t" + "xvld $xr9, %0, 32 \n\t" + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "xvor.v $xr11, $xr9, $xr9 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 + "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" + "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" + "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" + "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" + "addi.d %0, %0, 96 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvst $xr10, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); } void ARGBToUVJRow_LASX(const uint8_t* src_argb, diff --git a/files/source/row_lsx.cc b/files/source/row_lsx.cc index 3e8b901a..e626072a 100644 --- a/files/source/row_lsx.cc +++ b/files/source/row_lsx.cc @@ -31,6 +31,91 @@ extern "C" { yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = 
__lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = __lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. +#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ + g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, 
v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + // Convert 8 pixels of YUV420 to RGB. #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ { \ @@ -118,42 +203,1083 @@ extern "C" { out_g = __lsx_vpackev_h(tmp1, tmp0); \ } -// Pack and Store 8 ARGB values. -#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ - { \ - __m128i temp0, temp1; \ - __m128i dst0, dst1; \ - \ - temp0 = __lsx_vpackev_b(in_g, in_b); \ - temp1 = __lsx_vpackev_b(in_a, in_r); \ - dst0 = __lsx_vilvl_h(temp1, temp0); \ - dst1 = __lsx_vilvh_h(temp1, temp0); \ - __lsx_vst(dst0, pdst_argb, 0); \ - __lsx_vst(dst1, pdst_argb, 16); \ - pdst_argb += 32; \ +// Pack and Store 16 ARGB values. 
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m128i temp0, temp1; \ + __m128i dst0, dst1; \ + \ + temp0 = __lsx_vpackev_b(in_g, in_b); \ + temp1 = __lsx_vpackev_b(in_a, in_r); \ + dst0 = __lsx_vilvl_h(temp1, temp0); \ + dst1 = __lsx_vilvh_h(temp1, temp0); \ + __lsx_vst(dst0, pdst_argb, 0); \ + __lsx_vst(dst1, pdst_argb, 16); \ + pdst_argb += 32; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _reg0, _reg1; \ + _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ + _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ + _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ + } + +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int 
len = width / 32; + __m128i src0, src1; + __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 8; + __m128i src, dst; + __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 8) << 1; + for (x = 0; x < len; x++) { + src = __lsx_vld(src_uv, 0); + dst = __lsx_vshuf_h(shuffler, src, src); + __lsx_vst(dst, dst_uv, 0); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 8; + __m128i src0, src1; + __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; + + src += (width * 4) - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_yuy2_0, vec_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); + __lsx_vst(vec_yuy2_0, dst_yuy2, 0); + __lsx_vst(vec_yuy2_1, dst_yuy2, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_yuy2 += 32; + } +} + +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + 
uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_uyvy0, vec_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); + __lsx_vst(vec_uyvy0, dst_uyvy, 0); + __lsx_vst(vec_uyvy1, dst_uyvy, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_uyvy += 32; + } +} + +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; 
x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + int res = width & 15; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i zero = __lsx_vldi(0); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lsx_vld(src_a, 0); + a_l = __lsx_vilvl_b(zero, y); + a_h = __lsx_vilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + src_a += 16; + } + if (res) { + __m128i y, uv, r, g, b, a; + a = __lsx_vld(src_a, 0); + a = __lsx_vsllwil_hu_bu(a, 0); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i 
shuffler0 = {0x0504120302100100, 0x0A18090816070614}; + __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m128i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lsx_vpackev_b(g_l, b_l); + temp1 = __lsx_vpackev_b(g_h, b_h); + DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l, + temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lsx_vilvl_d(temp1, temp2); + b_h = __lsx_vilvh_d(temp3, temp1); + __lsx_vst(temp0, dst_argb, 0); + __lsx_vst(b_l, dst_argb, 16); + __lsx_vst(b_h, dst_argb, 32); + dst_argb += 48; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
+void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 2); + g_h = __lsx_vsrli_h(g_h, 2); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 11); + r_h = __lsx_vslli_h(r_h, 11); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vor_v(r_l, g_l); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, g_h); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_rgb565, 0); + __lsx_vst(r_h, dst_rgb565, 16); + dst_rgb565 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0xF000F000F000F000, 0xF000F000F000F000}; + __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 4); + b_h = __lsx_vsrli_h(b_h, 4); + r_l = __lsx_vsrli_h(r_l, 4); + r_h = __lsx_vsrli_h(r_h, 4); + g_l = __lsx_vand_v(g_l, mask); + g_h = __lsx_vand_v(g_h, mask); + r_l = __lsx_vslli_h(r_l, 8); + r_h = __lsx_vslli_h(r_h, 8); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb4444, 0); + __lsx_vst(r_h, dst_argb4444, 16); + dst_argb4444 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0x8000800080008000, 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = 
__lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 3); + + g_h = __lsx_vsrli_h(g_h, 3); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 10); + r_h = __lsx_vslli_h(r_h, 10); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb1555, 0); + __lsx_vst(r_h, dst_argb1555, 16); + dst_argb1555 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + dst0 = __lsx_vpickev_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_yuy2 += 32; + dst_y += 16; + } +} + +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, + src_yuy2_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickod_b(src1, src0); + src1 = __lsx_vpickod_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + src_yuy2_next += 32; + dst_u += 8; 
+ dst_v += 8; + } +} + +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + tmp0 = __lsx_vpickod_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + dst0 = __lsx_vpickod_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_uyvy += 32; + dst_y += 16; + } +} + +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, + src_uyvy_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickev_b(src1, src0); + src1 = __lsx_vpickev_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + src_uyvy_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = 
__lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToUVRow_LSX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i vec0, vec1, vec2, vec3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; + __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; + __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; + __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; + __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, + 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1, + 48, src4, src5, src6, src7); + vec0 = __lsx_vaddwev_h_bu(src0, src4); + vec1 = __lsx_vaddwev_h_bu(src1, src5); + vec2 = __lsx_vaddwev_h_bu(src2, src6); + vec3 = __lsx_vaddwev_h_bu(src3, src7); + tmp0 = __lsx_vpickev_h(vec1, vec0); + tmp1 = __lsx_vpickev_h(vec3, vec2); + tmp2 = __lsx_vpickod_h(vec1, vec0); + tmp3 = __lsx_vpickod_h(vec3, vec2); + vec0 = __lsx_vaddwod_h_bu(src0, src4); + vec1 = __lsx_vaddwod_h_bu(src1, src5); + vec2 = __lsx_vaddwod_h_bu(src2, src6); + vec3 = __lsx_vaddwod_h_bu(src3, src7); + tmp4 = __lsx_vpickev_h(vec1, vec0); + tmp5 = __lsx_vpickev_h(vec3, vec2); + vec0 = __lsx_vpickev_h(tmp1, tmp0); + vec1 = __lsx_vpickod_h(tmp1, tmp0); + src0 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp3, tmp2); + vec1 = __lsx_vpickod_h(tmp3, tmp2); + src1 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp5, tmp4); + vec1 = 
__lsx_vpickod_h(tmp5, tmp4); + src2 = __lsx_vavgr_h(vec0, vec1); + dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); + dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); + dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); + dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); + dst0 = __lsx_vsrai_h(dst0, 8); + dst1 = __lsx_vsrai_h(dst1, 8); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + src_argb0 += 64; + src_argb1 += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x090A040506000102, 
0x000000000C0D0E08}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i shift = {0x0300030003000300, 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vpackev_b(zero, tmp1); + tmp1 = __lsx_vsrli_h(tmp1, 2); + tmp0 = __lsx_vsll_b(tmp0, shift); + tmp1 = __lsx_vslli_h(tmp1, 5); + dst0 = __lsx_vor_v(tmp0, tmp1); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; + __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, 
src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vsrl_b(tmp1, shift1); + tmp0 = __lsx_vsll_b(tmp0, shift2); + tmp2 = __lsx_vpackev_b(zero, tmp1); + tmp3 = __lsx_vpackod_b(zero, tmp1); + tmp2 = __lsx_vslli_h(tmp2, 5); + tmp3 = __lsx_vslli_h(tmp3, 15); + dst0 = __lsx_vor_v(tmp0, tmp2); + dst0 = __lsx_vor_v(dst0, tmp3); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vandi_b(tmp1, 0xF0); + tmp0 = __lsx_vsrli_b(tmp0, 4); + dst0 = __lsx_vor_v(tmp1, tmp0); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, dst0, dst1; + __m128i const_112 = __lsx_vldi(112); + __m128i const_74 = __lsx_vldi(74); + __m128i const_38 = __lsx_vldi(38); + __m128i const_94 = __lsx_vldi(94); + __m128i const_18 = __lsx_vldi(18); + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickev_h(src1, src0); + tmp1 = __lsx_vpickod_h(src1, src0); + tmp2 = __lsx_vpickev_h(src3, src2); + tmp3 = __lsx_vpickod_h(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lsx_vmulwod_h_bu(tmp0, const_74); + reg3 = 
__lsx_vmulwod_h_bu(tmp2, const_74); + reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst0 = __lsx_vpickev_b(reg1, reg0); + + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); + reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); + reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst1 = __lsx_vpickev_b(reg1, reg0); + + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_v, 0); + dst_u += 16; + dst_v += 16; + src_argb += 64; + } +} + +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp2 = __lsx_vilvl_b(zero, src1); + tmp3 = __lsx_vilvh_b(zero, src1); + dst0 = __lsx_vmuh_hu(tmp0, tmp2); + dst1 = __lsx_vmuh_hu(tmp1, tmp3); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vsadd_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + 
dst_argb += 16; + } +} + +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vssub_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i b, g, r, a, dst0, dst1; + __m128i control = {0x0005000100040000, 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(tmp0, tmp0); + r = __lsx_vpackod_b(tmp0, tmp0); + g = __lsx_vpackev_b(tmp1, tmp1); + a = __lsx_vpackod_b(tmp1, tmp1); + reg0 = __lsx_vmulwev_w_hu(b, a); + reg1 = __lsx_vmulwod_w_hu(b, a); + reg2 = __lsx_vmulwev_w_hu(r, a); + reg3 = __lsx_vmulwod_w_hu(r, a); + reg4 = __lsx_vmulwev_w_hu(g, a); + reg5 = __lsx_vmulwod_w_hu(g, a); + reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); + reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); + reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); + reg0 = __lsx_vshuf_h(control, reg0, reg0); + reg2 = __lsx_vshuf_h(control, reg2, reg2); + reg4 = __lsx_vshuf_h(control, reg4, reg4); + tmp0 = __lsx_vpackev_b(reg4, reg0); + tmp1 = __lsx_vpackev_b(a, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + src_argb += 32; + } +} + +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i b, g, r; + __m128i zero = 
__lsx_vldi(0); + __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); + + vec_dither = __lsx_vilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(zero, tmp0); + r = __lsx_vpackod_b(zero, tmp0); + g = __lsx_vpackev_b(zero, tmp1); + b = __lsx_vadd_h(b, vec_dither); + g = __lsx_vadd_h(g, vec_dither); + r = __lsx_vadd_h(r, vec_dither); + DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); + r = __lsx_vclip255_h(r); + b = __lsx_vsrai_h(b, 3); + g = __lsx_vsrai_h(g, 2); + r = __lsx_vsrai_h(r, 3); + g = __lsx_vslli_h(g, 5); + r = __lsx_vslli_h(r, 11); + dst0 = __lsx_vor_v(b, g); + dst0 = __lsx_vor_v(dst0, r); + __lsx_vst(dst0, dst_rgb, 0); + src_argb += 32; + dst_rgb += 16; } +} -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ - { \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _reg0, _reg1; \ - _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ - _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ - _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ - _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ - _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ - _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ - _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ - _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ - _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ - _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ - _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ - _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ - _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ - _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ - _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ - _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, dst0, dst1; + __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; + __m128i temp = 
__lsx_vldrepl_w(shuffler, 0); + + shuf = __lsx_vadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vshuf_b(src0, src0, shuf); + dst1 = __lsx_vshuf_b(src1, src1, shuf); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 4; + __m128i src0, dst0, tmp0, tmp1; + __m128i vec_value = __lsx_vreplgr2vr_w(value); + + vec_value = __lsx_vilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb, 0); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp0 = __lsx_vmuh_hu(tmp0, vec_value); + tmp1 = __lsx_vmuh_hu(tmp1, vec_value); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, dst0, dst1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + reg0 = __lsx_vdp2_h_bu(tmp0, const_br); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lsx_vadd_h(reg0, reg1); + tmp0 = __lsx_vpackod_b(reg2, reg2); + tmp1 = __lsx_vpackod_b(tmp1, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, spb, 
spg, spr; + __m128i dst0, dst1; + __m128i spb_g = __lsx_vldi(68); + __m128i spg_g = __lsx_vldi(88); + __m128i spr_g = __lsx_vldi(98); + __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; + __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16}; + __m128i spr_br = {0x3218321832183218, 0x3218321832183218}; + __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lsx_vdp2_h_bu(tmp0, spr_br); + spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lsx_vsrli_h(spb, 7); + spg = __lsx_vsrli_h(spg, 7); + spr = __lsx_vsrli_h(spr, 7); + spg = __lsx_vsat_hu(spg, 7); + spr = __lsx_vsat_hu(spr, 7); + reg0 = __lsx_vpackev_b(spg, spb); + reg1 = __lsx_vshuf_b(tmp1, spr, shuff); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; } +} void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, @@ -561,39 +1687,6 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - src0 = 
__lsx_vld(src_rgb24, 0); - src1 = __lsx_vld(src_rgb24, 16); - src2 = __lsx_vld(src_rgb24, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgb24 += 48; - } -} - void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -647,39 +1740,6 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, } } -void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_raw, 0); - src1 = __lsx_vld(src_raw, 16); - src2 = __lsx_vld(src_raw, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_raw += 48; - 
} -} - void RAWToUVRow_LSX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -914,62 +1974,6 @@ void SobelXYRow_LSX(const uint8_t* src_sobelx, } } -void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_128 = __lsx_vldi(0x480); - __m128i const_150 = __lsx_vldi(0x96); - __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_argb += 64; - } -} - -void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, 
tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_bgra += 64; - } -} - void BGRAToUVRow_LSX(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1018,34 +2022,6 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra, } } -void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_abgr += 64; - } -} - void ABGRToUVRow_LSX(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, @@ -1094,34 +2070,6 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr, } } -void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = 
__lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgba += 64; - } -} - void RGBAToUVRow_LSX(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, @@ -1821,6 +2769,216 @@ void HalfFloatRow_LSX(const uint16_t* src, } } +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
+static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // ARGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // RGBA + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G + "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 
21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "vld $vr4, %4, 0 \n\t" // load shuff + "vld $vr5, %4, 16 \n\t" + "vld $vr6, %4, 32 \n\t" + "vld $vr7, %4, 48 \n\t" + "1: \n\t" + "vld $vr8, %0, 0 \n\t" + "vld $vr9, %0, 16 \n\t" + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" + "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" + "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" + "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" + "addi.d %0, %0, 48 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) 
{ + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc deleted file mode 100644 index 362fd1cf..00000000 --- a/files/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include // For memcpy and memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 
0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] 
"f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - 
uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - 
__asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 
0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 
%[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], 
%[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. 
When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t c0 = 0x00ff00ff00ff00ff; - - __asm__ volatile( - "punpcklbh %[dither], %[dither], %[zero] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "paddh %[b], %[b], %[dither] \n\t" - "paddh %[g], %[g], %[dither] \n\t" - "paddh %[r], %[r], %[dither] \n\t" - "pcmpgth %[src0], %[b], %[c0] \n\t" - "or %[src0], %[src0], %[b] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[g], %[c0] \n\t" - "or %[src0], %[src0], %[g] \n\t" - "and %[g], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[r], %[c0] \n\t" - "or %[src0], %[src0], %[r] \n\t" - "and %[r], %[src0], %[c0] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu 
%[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), - [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) - : "memory"); -} - -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void 
ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw 
%[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw 
%[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - 
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], 
%[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] 
\n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 
%[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz 
%[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw 
%[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - 
uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw 
%[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], 
%[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, 
dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], 
%[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], 
%[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] 
\n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - 
"packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] 
"=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], 
%[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], 
%[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - 
"punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] 
"r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], 
%[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], 
%[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], 
%[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" 
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - 
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - 
"gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - 
"punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - 
"punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - 
"dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], 
%[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), 
[mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], 
%[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 
0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], 
%[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - 
"paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], 
%[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] 
"=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" 
- "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - 
"punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], 
%[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw 
%[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 
= 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw 
%[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t 
mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - 
"pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] 
\n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], 
%[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] 
\n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] 
"=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], 
%[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - 
"or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh 
%[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], 
%[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] 
"f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], 
%[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], 
%[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], 
%[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] 
\n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] 
"=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], 
%[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], 
%[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] 
"=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x0080004D0096001DULL; - const uint64_t mask3 = 0xFF000000FF000000ULL; - const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "and %[src37], %[src], %[mask3] \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t" - "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" - "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" - "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" - "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask4] \n\t" - "or %[dest], %[dest], %[src37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - 
"bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest), - [src37] "=&f"(src37) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4) - : "memory"); -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { - uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x002300440011ULL; - const uint64_t mask2 = 0x002D00580016ULL; - const uint64_t mask3 = 0x003200620018ULL; - const uint64_t mask4 = 0xFF000000FF000000ULL; - const uint64_t shift = 0x07; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[dest37], %[dest], %[mask4] \n\t" - - "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], 
%[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "or %[dest], %[dest], %[dest37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), - [dest] "=&f"(dest) - : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [shift] "f"(shift) - : "memory"); -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). 
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, - dest3; - uint64_t matrix, matrix_hi, matrix_lo; - uint64_t tmp0, tmp1; - const uint64_t shift0 = 0x06; - const uint64_t shift1 = 0x08; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psraw %[dest0], %[dest0], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psraw %[dest1], %[dest1], %[shift0] \n\t" - - "punpckhbh 
%[src_hi], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psraw %[dest2], %[dest2], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psraw %[dest3], %[dest3], %[shift0] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), 
[dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), - [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) - : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), - [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) - : "memory"); -} - -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "punpcklbh %[value], %[value], %[value] \n\t" - - "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), - [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [value] "f"(value), [shift] "f"(shift) - : "memory"); -} - -void ARGBMultiplyRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; - uint64_t dest, dest_lo, dest_hi; - const uint64_t mask = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) 
\n\t" - "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" - - "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" - "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -void ARGBAddRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "paddusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -void 
ARGBSubtractRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "psubusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -// Sobel functions which mimics SSSE3. -void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - uint64_t y00 = 0, y10 = 0, y20 = 0; - uint64_t y02 = 0, y12 = 0, y22 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] - "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] - "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], 
%[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" // a+b - "paddh %[y20], %[y20], %[y10] \n\t" // c+b - "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c - - "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub - "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub - "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[sobel], %[y10], %[y20] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" - "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" - "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" - "paddh %[y20], %[y20], %[y10] \n\t" - "paddh %[y00], %[y00], %[y20] \n\t" - - "paddh %[y02], %[y02], %[y12] \n\t" - "paddh %[y22], %[y22], %[y12] \n\t" - "paddh %[y02], %[y02], %[y22] \n\t" - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[y00], %[y10], %[y20] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[src_y2], %[src_y2], 8 \n\t" - "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop 
\n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), - [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), - [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - uint64_t y00 = 0, y01 = 0, y02 = 0; - uint64_t y10 = 0, y11 = 0, y12 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] - "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] - "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" // a+b - "paddh %[y02], %[y02], %[y01] \n\t" // c+b - "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c - - "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub - "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub - "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[sobel], %[y02], %[y12] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" - "gsldrc1 
%[y01], 0x05(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" - "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" - "paddh %[y02], %[y02], %[y01] \n\t" - "paddh %[y00], %[y00], %[y02] \n\t" - - "paddh %[y10], %[y10], %[y11] \n\t" - "paddh %[y12], %[y12], %[y11] \n\t" - "paddh %[y10], %[y10], %[y12] \n\t" - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[y00], %[y02], %[y12] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), - [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), - [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - double temp[3]; - uint64_t c1 = 0xff000000ff000000; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] - "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[t1], 
0x00(%[src_sobely]) \n\t" - // s7 s6 s5 s4 s3 s2 s1 s0 = a+b - "paddusb %[t2] , %[t0], %[t1] \n\t" - - // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 - "punpcklbh %[t0], %[t2], %[t2] \n\t" - - // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s1 s1 s1 s55 s0 s0 s0 - "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" - - // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s3 s3 s3 255 s2 s2 s2 - "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" - - // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 - "punpckhbh %[t0], %[t2], %[t2] \n\t" - - // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" - - // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - uint64_t tr = 0; - uint64_t tb = 0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" - "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" - "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] - "paddusb %[tr], %[tr], %[tb] \n\t" // g - 
"gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" - - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(tr), [tb] "=&f"(tb) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_y] "r"(dst_y), [width] "r"(width) - : "memory"); -} - -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - uint64_t temp[3]; - uint64_t result = 0; - uint64_t gb = 0; - uint64_t cr = 0; - uint64_t c1 = 0xffffffffffffffff; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - 
"daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ 
volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] 
\n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - 
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] 
\n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 
\n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - 
"gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 
%[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). 
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 
0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 
0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] 
"=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] 
"f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : 
[src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], 
%[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. 
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[row_ptr]) 
\n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] 
\n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] \n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. 
-void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - 
"daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 
0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b 
\n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], 
-0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] \n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], 
%[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t 
b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], 
%[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) 
\n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 
%[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], 
%[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], 
%[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - 
//u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 
0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], 
%[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); -} - -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] 
\n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - 
"punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - 
"psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - 
"ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) 
\n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], 
%[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, 
- int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd 
%[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); -} - -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 
0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh 
%[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw 
%[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); -} - -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" 
- "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], 
%[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 
%[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] 
\n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc index 804ff839..4ed13638 100644 --- a/files/source/row_neon.cc +++ b/files/source/row_neon.cc @@ -89,12 +89,14 @@ extern "C" { "vsli.u16 d2, d2, #8 \n" \ "vsri.u16 d3, d3, #8 \n" +// TODO: Use single register for kUVCoeff and multiply by lane #define YUVTORGB_SETUP 
\ + "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + "vdup.u16 q10, d31[1] \n" \ + "vdup.u16 q11, d31[2] \n" \ + "vdup.u16 q12, d31[3] \n" \ + "vdup.u16 d31, d31[0] \n" // q0: B uint16x8_t // q1: G uint16x8_t @@ -156,6 +158,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "d6"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -586,10 +611,10 @@ void DetileRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop - "pld [%0, 1792] \n" - "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "pld [%0, #1792] \n" + "vst1.8 {q0}, [%1]! \n" // store 16 bytes "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -599,6 +624,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, #3584] \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -609,7 +654,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" - "pld [%0, 1792] \n" + "pld [%0, #1792] \n" "vst1.8 {d0}, [%1]! \n" "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" @@ -622,6 +667,101 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "pld [%0, #1792] \n" + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "pld [%1, #1792] \n" + "subs %3, %3, #16 \n" + "vst2.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "subs %3, %3, #16 \n" + "pld [%0, #1792] \n" + "vzip.8 q0, q1 \n" + "pld [%1, #1792] \n" + "vst1.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list + ); +} +#endif + +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "vld1.8 {q14}, [%0]! \n" // Load lower bits. + "vld1.8 {q9}, [%0]! \n" // Load upper bits row + // by row. + "vld1.8 {q11}, [%0]! \n" + "vld1.8 {q13}, [%0]! \n" + "vld1.8 {q15}, [%0]! \n" + "vshl.u8 q8, q14, #6 \n" // Shift lower bit data + // appropriately. + "vshl.u8 q10, q14, #4 \n" + "vshl.u8 q12, q14, #2 \n" + "vzip.u8 q8, q9 \n" // Interleave upper and + // lower bits. + "vzip.u8 q10, q11 \n" + "vzip.u8 q12, q13 \n" + "vzip.u8 q14, q15 \n" + "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits + // into lower 6 bits for + // better accuracy in + // conversions. + "vsri.u16 q9, q9, #10 \n" + "vsri.u16 q10, q10, #10 \n" + "vsri.u16 q11, q11, #10 \n" + "vsri.u16 q12, q12, #10 \n" + "vsri.u16 q13, q13, #10 \n" + "vsri.u16 q14, q14, #10 \n" + "vsri.u16 q15, q15, #10 \n" + "vstmia %1!, {q8-q15} \n" // Store pixel block (64 + // pixels). + "subs %2, %2, #80 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -664,7 +804,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(dst_b), // %3 "+r"(width) // %4 : // Input registers - : "cc", "memory", "d0", "d1", "d2" // Clobber List + : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } @@ -1505,6 +1645,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 q4, q1, q3 \n" // average rows of UV + "vst1.8 {q4}, [%2]! \n" // store 8 UV. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1590,7 +1753,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 @@ -1762,7 +1925,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1808,6 +1971,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + // TODO(fbarchard): Subsample match C code. 
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, @@ -2567,6 +2775,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -3633,7 +3845,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3687,31 +3899,25 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "vdup.16 q0, %3 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" + "vdup.16 d8, %3 \n" + "1: \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" + "vst1.16 {q0, q1}, [%1]! 
\n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); } // Use scale to convert lsb formats to msb, depending how many bits there are: diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc index 0f120373..74190d61 100644 --- a/files/source/row_neon64.cc +++ b/files/source/row_neon64.cc @@ -142,6 +142,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v19"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -627,6 +650,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead + "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -650,6 +693,100 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "prfm pldl1keep, [%0, 1792] \n" + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "prfm pldl1keep, [%1, 1792] \n" + "subs %w3, %w3, #16 \n" // store 8 YUY2 + "st2 {v0.16b,v1.16b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 1792] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 
8 YUY2 + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list + ); +} +#endif + +// Unpack MT2T into tiled P010 64 pixels at a time. See +// tinyurl.com/mtk-10bit-video-format for format documentation. +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "ld1 {v7.16b}, [%0], #16 \n" + "ld1 {v0.16b-v3.16b}, [%0], #64 \n" + "shl v4.16b, v7.16b, #6 \n" + "shl v5.16b, v7.16b, #4 \n" + "shl v6.16b, v7.16b, #2 \n" + "subs %2, %2, #80 \n" + "zip1 v16.16b, v4.16b, v0.16b \n" + "zip1 v18.16b, v5.16b, v1.16b \n" + "zip1 v20.16b, v6.16b, v2.16b \n" + "zip1 v22.16b, v7.16b, v3.16b \n" + "zip2 v17.16b, v4.16b, v0.16b \n" + "zip2 v19.16b, v5.16b, v1.16b \n" + "zip2 v21.16b, v6.16b, v2.16b \n" + "zip2 v23.16b, v7.16b, v3.16b \n" + "sri v16.8h, v16.8h, #10 \n" + "sri v17.8h, v17.8h, #10 \n" + "sri v18.8h, v18.8h, #10 \n" + "sri v19.8h, v19.8h, #10 \n" + "st1 {v16.8h-v19.8h}, [%1], #64 \n" + "sri v20.8h, v20.8h, #10 \n" + "sri v21.8h, v21.8h, #10 \n" + "sri v22.8h, v22.8h, #10 \n" + "sri v23.8h, v23.8h, #10 \n" + "st1 {v20.8h-v23.8h}, [%1], #64 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, @@ -1729,6 +1866,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row + "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV + "prfm pldl1keep, [%0, 448] \n" + "st1 {v4.16b}, [%2], #16 \n" // store 8 UV. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1819,24 +1979,23 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 + "dup v1.4s, %w3 \n" // dither4 "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. "uqadd v16.8b, v16.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "uqadd v17.8b, v17.8b, v1.8b \n" "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_rgb), // %1 + "+r"(width) // %2 + : "r"(dither4) // %3 : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } @@ -2144,6 +2303,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2189,6 +2349,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2812,6 +3017,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -4241,23 +4450,19 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "dup v0.8h, %w3 \n" + "dup v4.8h, %w3 \n" "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" + "ldp q2, q3, [%0], #32 \n" + "umull v0.4s, v2.4h, v4.4h \n" + "umull2 v1.4s, v2.8h, v4.8h \n" + "umull v2.4s, v3.4h, v4.4h \n" + "umull2 v3.4s, v3.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1], #32 \n" // store 16 pixels + "shrn v0.4h, v0.4s, #16 \n" + "shrn2 v0.8h, v1.4s, #16 \n" + "shrn v1.4h, v2.4s, #16 \n" + "shrn2 v1.8h, v3.4s, #16 \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 diff --git a/files/source/row_rvv.cc b/files/source/row_rvv.cc new file mode 100644 index 00000000..27e91a3b --- 
/dev/null +++ b/files/source/row_rvv.cc @@ -0,0 +1,956 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * Contributed by Darren Hsieh + * Contributed by Bruce Lai + */ + +#include "libyuv/row.h" + +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#include +#include + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Fill YUV -> RGB conversion constants into vectors +// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode +// register) is set to round-to-nearest-up mode(0). +#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ + } + +// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 +#define READYUV422(vl, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = 
__riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 +#define READYUV444(vl, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Convert from YUV to fixed point RGB +#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ + v_b_16, v_r_16) \ + { \ + vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m8_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ + v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ + v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ + v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ + } + +// Convert from fixed point RGB To 8 bit RGB +#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ + { \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ + } + +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e8m4(avl); + v_argb = __riscv_vle8_v_u8m4(src_argb, vl); + v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl); + v_ar64 = 
__riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl); + __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl); + avl -= vl; + src_argb += vl; + dst_ar64 += vl; + } while (avl > 0); +} + +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} + +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e16m8(avl); + v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); + v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); + __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); + avl -= vl; + src_ar64 += vl; + dst_argb += vl; + } while (avl > 0); +} + +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, 
v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} + +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} + +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + 
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, 
v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + 
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} + +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + 
vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yb; + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) sets to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); + __riscv_vse8_v_u8m8(dst, v_data, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} + +// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 
256); + // Blend 100 / 0 - Copy row unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + // Averaging add + vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} + +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} + +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = 
(size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} + +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += 
vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} + +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 
coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_abgr, 
dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to 
store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} + +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) diff --git a/files/source/row_win.cc b/files/source/row_win.cc index c7c1ff60..5fb28521 100644 --- a/files/source/row_win.cc +++ b/files/source/row_win.cc @@ -14,7 +14,9 @@ #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) -#if defined(_M_X64) +#if defined(_M_ARM64EC) +#include +#elif defined(_M_X64) #include #include // For _mm_maddubs_epi16 #endif @@ -893,7 +895,7 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { @@ -940,7 +942,7 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -2789,6 +2791,44 @@ __declspec(naked) void I422ToRGB24Row_SSSE3( } } +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). 
+__declspec(naked) void I444ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV444 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). __declspec(naked) void I422ToRGB565Row_SSSE3( @@ -3423,17 +3463,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 + vpmovzxbw ymm0, [eax] + vpmovzxbw ymm1, [eax + edx] + lea eax, [eax + 16] + vpsllw ymm1, ymm1, 8 + vpor ymm2, ymm1, ymm0 + vmovdqu [edi], ymm2 + lea edi, [edi + 32] + sub ecx, 16 jg convertloop pop edi diff --git a/files/source/scale.cc b/files/source/scale.cc index e1335f1e..80b030dc 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -198,6 +198,51 @@ static void ScalePlaneDown2_16(int src_width, } } +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width, int scale) = + (src_width & 1) + ? (filtering == kFilterNone + ? ScaleRowDown2_16To8_Odd_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C + : ScaleRowDown2Box_16To8_Odd_C)) + : (filtering == kFilterNone + ? ScaleRowDown2_16To8_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C + : ScaleRowDown2Box_16To8_C)); + int row_stride = src_stride * 2; + (void)dst_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < src_height / 2; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale); + src_ptr += row_stride; + dst_ptr += dst_stride; + } + if (src_height & 1) { + if (!filtering) { + src_ptr -= src_stride; // Point to last row. + } + ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale); + } +} + // Scale plane, 1/4 // This is an optimized version for scaling down a plane to 1/4 of // its original size. 
@@ -775,9 +820,11 @@ static void ScaleAddCols2_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); } } @@ -797,9 +844,10 @@ static void ScaleAddCols2_16_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16; } } @@ -814,7 +862,7 @@ static void ScaleAddCols0_C(int dst_width, (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = src_ptr[i] * scaleval >> 16; + *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16); } } @@ -829,7 +877,7 @@ static void ScaleAddCols1_C(int dst_width, int i; x >>= 16; for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); x += boxwidth; } } @@ -1020,10 +1068,10 @@ void ScalePlaneBilinearDown(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1070,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1143,10 +1196,10 @@ void ScalePlaneBilinearDown_16(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1231,10 +1284,10 @@ void ScalePlaneBilinearUp(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1265,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1315,11 +1373,11 @@ void ScalePlaneBilinearUp(int src_width, const uint8_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleFilterCols(rowptr, src, dst_width, x, dx); @@ -1699,10 +1757,10 @@ void ScalePlaneBilinearUp_16(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1766,11 +1824,11 @@ void ScalePlaneBilinearUp_16(int src_width, const uint16_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. 
- const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint16_t* rowptr = (uint16_t*)row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleFilterCols(rowptr, src, dst_width, x, dx); @@ -1827,7 +1885,7 @@ static void ScalePlaneSimple(int src_width, const uint8_t* src_ptr, uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1864,7 +1922,7 @@ static void ScalePlaneSimple_16(int src_width, const uint16_t* src_ptr, uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
int x = 0; diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc index 317041f8..f6576874 100644 --- a/files/source/scale_any.cc +++ b/files/source/scale_any.cc @@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2_NEON +SDANY(ScaleUVRowDown2_Any_NEON, + ScaleUVRowDown2_NEON, + ScaleUVRowDown2_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON +SDANY(ScaleUVRowDown2Linear_Any_NEON, + ScaleUVRowDown2Linear_NEON, + ScaleUVRowDown2Linear_C, + 2, + 2, + 7) +#endif #ifdef HAS_SCALEUVROWDOWN2BOX_NEON SDANY(ScaleUVRowDown2Box_Any_NEON, ScaleUVRowDown2Box_NEON, diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc index 9c3acf7f..ddd8d29e 100644 --- a/files/source/scale_argb.cc +++ b/files/source/scale_argb.cc @@ -58,9 +58,9 @@ static void ScaleARGBDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; } else { - src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4; } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) @@ -155,14 +155,14 @@ static void ScaleARGBDown4Box(int src_width, int dy) { int j; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. 
- src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; (void)src_width; (void)src_height; (void)dx; @@ -187,9 +187,9 @@ static void ScaleARGBDown4Box(int src_width, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + ScaleARGBRowDown2(row, row_size, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } @@ -214,7 +214,7 @@ static void ScaleARGBDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; @@ -222,7 +222,7 @@ static void ScaleARGBDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_SSE2 @@ -289,10 +289,10 @@ static void ScaleARGBBilinearDown(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -388,7 +393,7 @@ static void ScaleARGBBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -421,10 +426,10 @@ static void ScaleARGBBilinearUp(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; @@ -467,6 +472,11 @@ static void ScaleARGBBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleARGBFilterCols = @@ -545,14 +555,14 @@ static void ScaleARGBBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -570,7 +580,7 @@ static void ScaleARGBBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_argb + yi * (int64_t)src_stride; + src = src_argb + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -659,6 +669,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -667,8 +685,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = 
InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -711,8 +734,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { @@ -793,19 +821,19 @@ static void ScaleYUVToARGBBilinearUp(int src_width, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. 
@@ -833,9 +861,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, y = max_y; yi = y >> 16; uv_yi = yi >> kYShift; - src_row_y = src_y + yi * (int64_t)src_stride_y; - src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + src_row_y = src_y + yi * (intptr_t)src_stride_y; + src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; } if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. @@ -883,7 +911,7 @@ static void ScaleARGBSimple(int src_width, int y, int dy) { int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; @@ -926,7 +954,7 @@ static void ScaleARGBSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_argb += dst_stride; y += dy; @@ -962,7 +990,7 @@ static void ScaleARGB(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -977,7 +1005,7 @@ static void ScaleARGB(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -1011,7 +1039,7 @@ static void ScaleARGB(const uint8_t* src, filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, + ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); return; } diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc index b02bdafd..77455903 100644 --- a/files/source/scale_common.cc +++ b/files/source/scale_common.cc @@ -23,6 +23,25 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + +// TODO(fbarchard): make clamp255 preserve negative values. +static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + static __inline int Abs(int v) { return v >= 0 ? 
v : -v; } @@ -62,6 +81,50 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + } +} + +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst += 1; + src_ptr += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale)); +} + void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -98,6 +161,52 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + } +} + +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* 
dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst += 1; + s += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale)); +} + void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -160,6 +269,61 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + } +} + +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, 
scale)); + dst += 1; + s += 2; + t += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale)); +} + void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -1116,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); int x; (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; } } @@ -1469,7 +1628,7 @@ void ScalePlaneVertical(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1519,6 +1678,12 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1548,7 +1713,7 @@ void ScalePlaneVertical_16(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; @@ -1627,7 +1792,7 @@ void ScalePlaneVertical_16To8(int src_height, // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. - void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int scale, int dst_width, int source_y_fraction) = InterpolateRow_16To8_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc index edaf2e29..17eeffad 100644 --- a/files/source/scale_gcc.cc +++ b/files/source/scale_gcc.cc @@ -1094,7 +1094,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif @@ -1294,7 +1295,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc deleted file mode 100644 index 1226ef3e..00000000 --- a/files/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], 
%[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - 
"paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 
0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] 
\n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - 
"gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - 
"gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - 
"psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} 
- -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr + src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], 
%[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], %[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* 
LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. 
-void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - 
"gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - 
int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh 
%[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. 
-void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], 
%[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] 
"r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - 
: "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc index 6a0d6e1b..ccc75106 100644 --- a/files/source/scale_neon.cc +++ b/files/source/scale_neon.cc @@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.16 {q1}, [%1]! \n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.16 {q0}, [%1]! 
\n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc index 9f9636e6..ad06ee83 100644 --- a/files/source/scale_neon64.cc +++ b/files/source/scale_neon64.cc @@ -1568,6 +1568,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_uv.cc b/files/source/scale_uv.cc index 3b3d7b8e..1556071d 100644 --- a/files/source/scale_uv.cc +++ b/files/source/scale_uv.cc @@ -83,9 +83,9 @@ static void ScaleUVDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. 
if (filtering == kFilterBilinear) { - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; } else { - src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; } #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) @@ -112,6 +112,22 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif // This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_SSSE3) @@ -130,23 +146,7 @@ static void ScaleUVDown2(int src_width, } } #endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif + #if defined(HAS_SCALEUVROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVRowDown2 = @@ -193,14 +193,14 @@ static void ScaleUVDown4Box(int src_width, int dy) { int j; // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) = ScaleUVRowDown2Box_C; // Advance to odd row, even column. - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; (void)src_width; (void)src_height; (void)dx; @@ -234,9 +234,9 @@ static void ScaleUVDown4Box(int src_width, for (j = 0; j < dst_height; ++j) { ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); - ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + ScaleUVRowDown2(row, row_size, dst_uv, dst_width); src_uv += row_stride; dst_uv += dst_stride; } @@ -263,7 +263,7 @@ static void ScaleUVDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, int src_step, uint8_t* dst_uv, int dst_width) = filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; @@ -271,7 +271,7 @@ static void ScaleUVDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? 
ScaleUVRowDownEvenBox_Any_SSSE3 @@ -338,10 +338,10 @@ static void ScaleUVBilinearDown(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; @@ -429,7 +434,7 @@ static void ScaleUVBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); } else { @@ -464,10 +469,10 @@ static void ScaleUVBilinearUp(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; @@ -510,6 +515,11 @@ static void ScaleUVBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; @@ -571,14 +581,14 @@ static void ScaleUVBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; // Allocate 2 rows of UV. - const int kRowSize = (dst_width * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -596,7 +606,7 @@ static void ScaleUVBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_uv + yi * (int64_t)src_stride; + src = src_uv + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -644,32 +654,32 @@ void ScaleUVLinearUp2(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; } #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -697,19 +707,19 @@ void ScaleUVBilinearUp2(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; } @@ -751,32 +761,32 @@ void ScaleUVLinearUp2_16(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -804,19 +814,19 @@ void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; } @@ -854,7 +864,7 @@ static void ScaleUVSimple(int src_width, int y, int dy) { int j; - void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; @@ -889,7 +899,7 @@ static void ScaleUVSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_uv += dst_stride; y += dy; @@ -910,7 +920,7 @@ static int UVCopy(const uint8_t* src_uv, // Negative height means invert the image. if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -930,7 +940,7 @@ static int UVCopy_16(const uint16_t* src_uv, // Negative height means invert the image. if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -968,7 +978,7 @@ static void ScaleUV(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -983,7 +993,7 @@ static void ScaleUV(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -1024,7 +1034,7 @@ static void ScaleUV(const uint8_t* src, #ifdef HAS_UVCOPY if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, + UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2, src_stride, dst, dst_stride, clip_width, clip_height); return; } @@ -1039,7 +1049,7 @@ static void ScaleUV(const uint8_t* src, dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering); return; } - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); return; @@ -1118,7 +1128,7 @@ int UVScale_16(const uint16_t* src_uv, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } src_width = Abs(src_width); @@ -1126,20 +1136,20 @@ int UVScale_16(const uint16_t* src_uv, #ifdef HAS_UVCOPY if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, + UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv, src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } else { dy = src_height / dst_height; - UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, - dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, - dst_height); + UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv, + (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv, + dst_width, dst_height); } return 0; } #endif - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, src_stride_uv, dst_stride_uv, src_uv, dst_uv); return 0; diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py 
index 977c86de..2b57eb65 100755 --- a/files/tools_libyuv/autoroller/roll_deps.py +++ b/files/tools_libyuv/autoroller/roll_deps.py @@ -1,18 +1,14 @@ #!/usr/bin/env vpython3 -# Copyright 2017 The LibYuv Project Authors. All rights reserved. +# Copyright (c) 2017 The LibYUV project authors. All Rights Reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may +# in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +"""Script to automatically roll dependencies in the LibYUV DEPS file.""" -# This is a modified copy of the script in -# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py -# customized for libyuv. - -"""Script to automatically roll dependencies in the libyuv DEPS file.""" import argparse import base64 @@ -25,9 +21,46 @@ import sys import urllib.request +def FindSrcDirPath(): + """Returns the abs path to the src/ dir of the project.""" + src_dir = os.path.dirname(os.path.abspath(__file__)) + while os.path.basename(src_dir) != 'src': + src_dir = os.path.normpath(os.path.join(src_dir, os.pardir)) + return src_dir + + # Skip these dependencies (list without solution name prefix). DONT_AUTOROLL_THESE = [ - 'src/third_party/gflags/src', + 'src/third_party/gflags/src', + 'src/third_party/mockito/src', +] + +# These dependencies are missing in chromium/src/DEPS, either unused or already +# in-tree. For instance, src/base is a part of the Chromium source git repo, +# but we pull it through a subtree mirror, so therefore it isn't listed in +# Chromium's deps but it is in ours. 
+LIBYUV_ONLY_DEPS = [ + 'src/base', + 'src/build', + 'src/buildtools', + 'src/ios', + 'src/testing', + 'src/third_party', + 'src/third_party/android_support_test_runner', + 'src/third_party/bazel', + 'src/third_party/bouncycastle', + 'src/third_party/errorprone/lib', + 'src/third_party/findbugs', + 'src/third_party/gson', + 'src/third_party/gtest-parallel', + 'src/third_party/guava', + 'src/third_party/intellij', + 'src/third_party/jsr-305/src', + 'src/third_party/ow2_asm', + 'src/third_party/proguard', + 'src/third_party/ub-uiautomator/lib', + 'src/tools', + 'src/tools/clang/dsymutil', ] LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' @@ -37,16 +70,22 @@ CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') -CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([-0-9a-z]+)\'$') ROLL_BRANCH_NAME = 'roll_chromium_revision' SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir, - os.pardir)) +CHECKOUT_SRC_DIR = FindSrcDirPath() CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) +# Copied from tools/android/roll/android_deps/.../BuildConfigGenerator.groovy. +ANDROID_DEPS_START = r'=== ANDROID_DEPS Generated Code Start ===' +ANDROID_DEPS_END = r'=== ANDROID_DEPS Generated Code End ===' +# Location of automically gathered android deps. 
+ANDROID_DEPS_PATH = 'src/third_party/android_deps/' + sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) -import find_depot_tools # pylint: disable=wrong-import-position +import find_depot_tools + find_depot_tools.add_depot_tools_to_path() CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' @@ -56,11 +95,26 @@ CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') ChangedDep = collections.namedtuple('ChangedDep', 'path url current_rev new_rev') +CipdDepsEntry = collections.namedtuple('CipdDepsEntry', 'path packages') +VersionEntry = collections.namedtuple('VersionEntry', 'version') +ChangedCipdPackage = collections.namedtuple( + 'ChangedCipdPackage', 'path package current_version new_version') +ChangedVersionEntry = collections.namedtuple( + 'ChangedVersionEntry', 'path current_version new_version') + +ChromiumRevisionUpdate = collections.namedtuple('ChromiumRevisionUpdate', + ('current_chromium_rev ' + 'new_chromium_rev ')) + class RollError(Exception): pass +def StrExpansion(): + return lambda str_value: str_value + + def VarLookup(local_scope): return lambda var_name: local_scope['vars'][var_name] @@ -68,9 +122,9 @@ def VarLookup(local_scope): def ParseDepsDict(deps_content): local_scope = {} global_scope = { - 'Var': VarLookup(local_scope), - 'Str': lambda s: s, - 'deps_os': {}, + 'Str': StrExpansion(), + 'Var': VarLookup(local_scope), + 'deps_os': {}, } exec(deps_content, global_scope, local_scope) return local_scope @@ -82,11 +136,6 @@ def ParseLocalDepsFile(filename): return ParseDepsDict(deps_content) -def ParseRemoteCrDepsFile(revision): - deps_content = ReadRemoteCrFile('DEPS', revision) - return ParseDepsDict(deps_content) - - def ParseCommitPosition(commit_message): for line in reversed(commit_message.splitlines()): m = COMMIT_POSITION_RE.match(line.strip()) @@ -97,15 +146,18 @@ def ParseCommitPosition(commit_message): sys.exit(-1) -def 
_RunCommand(command, working_dir=None, ignore_exit_code=False, - extra_env=None, input_data=None): +def _RunCommand(command, + working_dir=None, + ignore_exit_code=False, + extra_env=None, + input_data=None): """Runs a command and returns the output from that command. - If the command fails (exit code != 0), the function will exit the process. + If the command fails (exit code != 0), the function will exit the process. - Returns: - A tuple containing the stdout and stderr outputs as strings. - """ + Returns: + A tuple containing the stdout and stderr outputs as strings. + """ working_dir = working_dir or CHECKOUT_SRC_DIR logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) env = os.environ.copy() @@ -134,9 +186,9 @@ def _RunCommand(command, working_dir=None, ignore_exit_code=False, def _GetBranches(): """Returns a tuple of active,branches. - The 'active' is the name of the currently active branch and 'branches' is a - list of all branches. - """ + The 'active' is the name of the currently active branch and 'branches' is a + list of all branches. + """ lines = _RunCommand(['git', 'branch'])[0].split('\n') branches = [] active = '' @@ -160,9 +212,16 @@ def _ReadGitilesContent(url): def ReadRemoteCrFile(path_below_src, revision): - """Reads a remote Chromium file of a specific revision. Returns a string.""" - return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision, - path_below_src)) + """Reads a remote Chromium file of a specific revision. + + Args: + path_below_src: A path to the target file relative to src dir. + revision: Revision to read. + Returns: + A string with file content. + """ + return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % + (revision, path_below_src)) def ReadRemoteCrCommit(revision): @@ -171,7 +230,13 @@ def ReadRemoteCrCommit(revision): def ReadUrlContent(url): - """Connect to a remote host and read the contents. Returns a list of lines.""" + """Connect to a remote host and read the contents. + + Args: + url: URL to connect to. 
+ Returns: + A list of lines. + """ conn = urllib.request.urlopen(url) try: return conn.readlines() @@ -185,52 +250,172 @@ def ReadUrlContent(url): def GetMatchingDepsEntries(depsentry_dict, dir_path): """Gets all deps entries matching the provided path. - This list may contain more than one DepsEntry object. - Example: dir_path='src/testing' would give results containing both - 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS. - Example 2: dir_path='src/build' should return 'src/build' but not - 'src/buildtools'. + This list may contain more than one DepsEntry object. + Example: dir_path='src/testing' would give results containing both + 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's + DEPS. + Example 2: dir_path='src/build' should return 'src/build' but not + 'src/buildtools'. - Returns: - A list of DepsEntry objects. - """ + Returns: + A list of DepsEntry objects. + """ result = [] for path, depsentry in depsentry_dict.items(): if path == dir_path: result.append(depsentry) else: parts = path.split('/') - if all(part == parts[i] - for i, part in enumerate(dir_path.split('/'))): + if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))): result.append(depsentry) return result + def BuildDepsentryDict(deps_dict): """Builds a dict of paths to DepsEntry objects from a raw deps dict.""" result = {} def AddDepsEntries(deps_subdict): - for path, deps_url_spec in deps_subdict.items(): - if isinstance(deps_url_spec, dict): - if deps_url_spec.get('dep_type') == 'cipd': - continue - deps_url = deps_url_spec['url'] + for path, dep in deps_subdict.items(): + if path in result: + continue + if not isinstance(dep, dict): + dep = {'url': dep} + if dep.get('dep_type') == 'cipd': + result[path] = CipdDepsEntry(path, dep['packages']) else: - deps_url = deps_url_spec - if not path in result: - url, revision = deps_url.split('@') if deps_url else (None, None) + if '@' not in dep['url']: + continue + url, revision = 
dep['url'].split('@') result[path] = DepsEntry(path, url, revision) + def AddVersionEntry(vars_subdict): + for key, value in vars_subdict.items(): + if key in result: + continue + if not key.endswith('_version'): + continue + key = re.sub('_version$', '', key) + result[key] = VersionEntry(value) + AddDepsEntries(deps_dict['deps']) - for deps_os in ['win', 'mac', 'linux', 'android', 'ios', 'unix']: + for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']: AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) + AddVersionEntry(deps_dict.get('vars', {})) return result +def _FindChangedCipdPackages(path, old_pkgs, new_pkgs): + old_pkgs_names = {p['package'] for p in old_pkgs} + new_pkgs_names = {p['package'] for p in new_pkgs} + pkgs_equal = (old_pkgs_names == new_pkgs_names) + added_pkgs = [p for p in new_pkgs_names if p not in old_pkgs_names] + removed_pkgs = [p for p in old_pkgs_names if p not in new_pkgs_names] + + assert pkgs_equal, ('Old: %s\n New: %s.\nYou need to do a manual roll ' + 'and remove/add entries in DEPS so the old and new ' + 'list match.\nMost likely, you should add \"%s\" and ' + 'remove \"%s\"' % + (old_pkgs, new_pkgs, added_pkgs, removed_pkgs)) + + for old_pkg in old_pkgs: + for new_pkg in new_pkgs: + old_version = old_pkg['version'] + new_version = new_pkg['version'] + if (old_pkg['package'] == new_pkg['package'] + and old_version != new_version): + logging.debug('Roll dependency %s to %s', path, new_version) + yield ChangedCipdPackage(path, old_pkg['package'], old_version, + new_version) + + +def _FindChangedVars(name, old_version, new_version): + if old_version != new_version: + logging.debug('Roll dependency %s to %s', name, new_version) + yield ChangedVersionEntry(name, old_version, new_version) + + +def _FindNewDeps(old, new): + """ Gather dependencies only in `new` and return corresponding paths. 
""" + old_entries = set(BuildDepsentryDict(old)) + new_entries = set(BuildDepsentryDict(new)) + return [ + path for path in new_entries - old_entries + if path not in DONT_AUTOROLL_THESE + ] + + +def FindAddedDeps(libyuv_deps, new_cr_deps): + """ + Calculate new deps entries of interest. + + Ideally, that would mean: only appearing in chromium DEPS + but transitively used in LibYUV. + + Since it's hard to compute, we restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + Otherwise, assumes that's a Chromium-only dependency. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a new package in existing dep. + + Returns: + A tuple consisting of: + A list of paths added dependencies sitting in `ANDROID_DEPS_PATH`. + A list of paths for other added dependencies. + """ + all_added_deps = _FindNewDeps(libyuv_deps, new_cr_deps) + generated_android_deps = [ + path for path in all_added_deps if path.startswith(ANDROID_DEPS_PATH) + ] + other_deps = [ + path for path in all_added_deps if path not in generated_android_deps + ] + return generated_android_deps, other_deps + + +def FindRemovedDeps(libyuv_deps, new_cr_deps): + """ + Calculate obsolete deps entries. + + Ideally, that would mean: no more appearing in chromium DEPS + and not used in LibYUV. + + Since it's hard to compute: + 1/ We restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + 2/ We rely on existing behavior of CalculateChangeDeps. + I.e. Assumes non-CIPD dependencies are LibYUV-only, don't remove them. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a deleted package in existing dep. + + Returns: + A tuple consisting of: + A list of paths of dependencies removed from `ANDROID_DEPS_PATH`. 
+ A list of paths of unexpected disappearing dependencies. + """ + all_removed_deps = _FindNewDeps(new_cr_deps, libyuv_deps) + generated_android_deps = sorted( + [path for path in all_removed_deps if path.startswith(ANDROID_DEPS_PATH)]) + # Webrtc-only dependencies are handled in CalculateChangedDeps. + other_deps = sorted([ + path for path in all_removed_deps + if path not in generated_android_deps and path not in LIBYUV_ONLY_DEPS + ]) + return generated_android_deps, other_deps + + def CalculateChangedDeps(libyuv_deps, new_cr_deps): """ - Calculate changed deps entries based on entries defined in the libyuv DEPS - file: + Calculate changed deps entries based on entries defined in the LibYUV DEPS + file: - If a shared dependency with the Chromium DEPS file: roll it to the same revision as Chromium (i.e. entry in the new_cr_deps dict) - If it's a Chromium sub-directory, roll it to the HEAD revision (notice @@ -239,9 +424,9 @@ def CalculateChangedDeps(libyuv_deps, new_cr_deps): - If it's another DEPS entry (not shared with Chromium), roll it to HEAD unless it's configured to be skipped. - Returns: - A list of ChangedDep objects representing the changed deps. - """ + Returns: + A list of ChangedDep objects representing the changed deps. + """ result = [] libyuv_entries = BuildDepsentryDict(libyuv_deps) new_cr_entries = BuildDepsentryDict(new_cr_deps) @@ -250,68 +435,117 @@ def CalculateChangedDeps(libyuv_deps, new_cr_deps): continue cr_deps_entry = new_cr_entries.get(path) if cr_deps_entry: + assert type(cr_deps_entry) is type(libyuv_deps_entry) + + if isinstance(cr_deps_entry, CipdDepsEntry): + result.extend( + _FindChangedCipdPackages(path, libyuv_deps_entry.packages, + cr_deps_entry.packages)) + continue + + if isinstance(cr_deps_entry, VersionEntry): + result.extend( + _FindChangedVars(path, libyuv_deps_entry.version, + cr_deps_entry.version)) + continue + # Use the revision from Chromium's DEPS file. 
new_rev = cr_deps_entry.revision assert libyuv_deps_entry.url == cr_deps_entry.url, ( - 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' % - (path, libyuv_deps_entry.url, cr_deps_entry.url)) + 'LibYUV DEPS entry %s has a different URL %s than Chromium %s.' % + (path, libyuv_deps_entry.url, cr_deps_entry.url)) else: - # Use the HEAD of the deps repo. - stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url, - 'HEAD']) - new_rev = stdout.strip().split('\t')[0] + if isinstance(libyuv_deps_entry, DepsEntry): + # Use the HEAD of the deps repo. + stdout, _ = _RunCommand( + ['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD']) + new_rev = stdout.strip().split('\t')[0] + else: + # The dependency has been removed from chromium. + # This is handled by FindRemovedDeps. + continue # Check if an update is necessary. if libyuv_deps_entry.revision != new_rev: logging.debug('Roll dependency %s to %s', path, new_rev) - result.append(ChangedDep(path, libyuv_deps_entry.url, - libyuv_deps_entry.revision, new_rev)) + result.append( + ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision, + new_rev)) return sorted(result) def CalculateChangedClang(new_cr_rev): + def GetClangRev(lines): for line in lines: match = CLANG_REVISION_RE.match(line) if match: return match.group(1) - raise RollError('Could not parse Clang revision from:\n' + '\n'.join(' ' + l for l in lines)) + raise RollError('Could not parse Clang revision!') with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f: current_lines = f.readlines() current_rev = GetClangRev(current_lines) new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, - new_cr_rev).splitlines() + new_cr_rev).splitlines() new_rev = GetClangRev(new_clang_update_py) return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) -def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, - new_commit_pos, changed_deps_list, clang_change): - current_cr_rev = current_cr_rev[0:10] 
- new_cr_rev = new_cr_rev[0:10] +def GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps_list, + added_deps_paths=None, + removed_deps_paths=None, + clang_change=None, +): + current_cr_rev = rev_update.current_chromium_rev[0:10] + new_cr_rev = rev_update.new_chromium_rev[0:10] rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) - commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval, - git_number_interval)] - commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval)) - commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % - rev_interval)) + commit_msg = [ + 'Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval), + 'Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval), + 'Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval) + ] + + def Section(adjective, deps): + noun = 'dependency' if len(deps) == 1 else 'dependencies' + commit_msg.append('%s %s' % (adjective, noun)) + if changed_deps_list: - commit_msg.append('Changed dependencies:') + Section('Changed', changed_deps_list) for c in changed_deps_list: - commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url, - c.current_rev[0:10], - c.new_rev[0:10])) + if isinstance(c, ChangedCipdPackage): + commit_msg.append('* %s: %s..%s' % + (c.path, c.current_version, c.new_version)) + elif isinstance(c, ChangedVersionEntry): + commit_msg.append('* %s_vesion: %s..%s' % + (c.path, c.current_version, c.new_version)) + else: + commit_msg.append('* %s: %s/+log/%s..%s' % + (c.path, c.url, c.current_rev[0:10], c.new_rev[0:10])) + + if added_deps_paths: + Section('Added', added_deps_paths) + commit_msg.extend('* %s' % p for p in added_deps_paths) + + if removed_deps_paths: + Section('Removed', removed_deps_paths) + commit_msg.extend('* %s' % p for p in removed_deps_paths) + + if any([changed_deps_list, added_deps_paths, removed_deps_paths]): change_url 
= CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') commit_msg.append('DEPS diff: %s\n' % change_url) else: commit_msg.append('No dependencies changed.') - if clang_change.current_rev != clang_change.new_rev: + if clang_change and clang_change.current_rev != clang_change.new_rev: commit_msg.append('Clang version changed %s:%s' % (clang_change.current_rev, clang_change.new_rev)) change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, @@ -320,38 +554,61 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, else: commit_msg.append('No update to Clang.\n') - # TBR needs to be non-empty for Gerrit to process it. - git_author = _RunCommand(['git', 'config', 'user.email'], - working_dir=CHECKOUT_SRC_DIR)[0].strip() - commit_msg.append('TBR=%s' % git_author) - commit_msg.append('BUG=None') return '\n'.join(commit_msg) -def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision, - changed_deps): +def UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content): """Update the DEPS file with the new revision.""" - # Update the chromium_revision variable. with open(deps_filename, 'rb') as deps_file: deps_content = deps_file.read().decode('utf-8') - deps_content = deps_content.replace(old_cr_revision, new_cr_revision) + + # Update the chromium_revision variable. + deps_content = deps_content.replace(rev_update.current_chromium_rev, + rev_update.new_chromium_rev) + + # Add and remove dependencies. For now: only generated android deps. + # Since gclient cannot add or remove deps, we on the fact that + # these android deps are located in one place we can copy/paste. 
+ deps_re = re.compile(ANDROID_DEPS_START + '.*' + ANDROID_DEPS_END, re.DOTALL) + new_deps = deps_re.search(new_cr_content) + old_deps = deps_re.search(deps_content) + if not new_deps or not old_deps: + faulty = 'Chromium' if not new_deps else 'LibYUV' + raise RollError('Was expecting to find "%s" and "%s"\n' + 'in %s DEPS' % + (ANDROID_DEPS_START, ANDROID_DEPS_END, faulty)) + deps_content = deps_re.sub(new_deps.group(0), deps_content) + + for dep in changed_deps: + if isinstance(dep, ChangedVersionEntry): + deps_content = deps_content.replace(dep.current_version, dep.new_version) + with open(deps_filename, 'wb') as deps_file: deps_file.write(deps_content.encode('utf-8')) # Update each individual DEPS entry. for dep in changed_deps: + # ChangedVersionEntry types are already been processed. + if isinstance(dep, ChangedVersionEntry): + continue local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) if not os.path.isdir(local_dep_dir): raise RollError( - 'Cannot find local directory %s. Make sure the .gclient file\n' - 'contains all platforms in the target_os list, i.e.\n' + 'Cannot find local directory %s. Either run\n' + 'gclient sync --deps=all\n' + 'or make sure the .gclient file for your solution contains all ' + 'platforms in the target_os list, i.e.\n' 'target_os = ["android", "unix", "mac", "ios", "win"];\n' 'Then run "gclient sync" again.' 
% local_dep_dir) - _RunCommand( - ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)], - working_dir=CHECKOUT_SRC_DIR) + if isinstance(dep, ChangedCipdPackage): + package = dep.package.format() # Eliminate double curly brackets + update = '%s:%s@%s' % (dep.path, package, dep.new_version) + else: + update = '%s@%s' % (dep.path, dep.new_rev) + _RunCommand(['gclient', 'setdep', '--revision', update], + working_dir=CHECKOUT_SRC_DIR) def _IsTreeClean(): @@ -363,9 +620,9 @@ def _IsTreeClean(): return False -def _EnsureUpdatedMasterBranch(dry_run): - current_branch = _RunCommand( - ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0] +def _EnsureUpdatedMainBranch(dry_run): + current_branch = _RunCommand(['git', 'rev-parse', '--abbrev-ref', + 'HEAD'])[0].splitlines()[0] if current_branch != 'main': logging.error('Please checkout the main branch and re-run this script.') if not dry_run: @@ -407,19 +664,34 @@ def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos): return 2 -def _UploadCL(commit_queue_mode): +def _GetCcRecipients(changed_deps_list): + """Returns a list of emails to notify based on the changed deps list. + """ + cc_recipients = [] + for c in changed_deps_list: + pass + return cc_recipients + + +def _UploadCL(commit_queue_mode, add_cc=None): """Upload the committed changes as a changelist to Gerrit. - commit_queue_mode: - - 2: Submit to commit queue. - - 1: Run trybots but do not submit to CQ. - - 0: Skip CQ, upload only. - """ - cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail'] + commit_queue_mode: + - 2: Submit to commit queue. + - 1: Run trybots but do not submit to CQ. + - 0: Skip CQ, upload only. + + add_cc: A list of email addresses to add as CC recipients. 
+ """ + cc_recipients = [] + if add_cc: + cc_recipients.extend(add_cc) + cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks'] if commit_queue_mode >= 2: logging.info('Sending the CL to the CQ...') cmd.extend(['-o', 'label=Bot-Commit+1']) cmd.extend(['-o', 'label=Commit-Queue+2']) + cmd.extend(['--send-mail', '--cc', ','.join(cc_recipients)]) elif commit_queue_mode >= 1: logging.info('Starting CQ dry run...') cmd.extend(['-o', 'label=Commit-Queue+1']) @@ -429,31 +701,57 @@ def _UploadCL(commit_queue_mode): } stdout, stderr = _RunCommand(cmd, extra_env=extra_env) logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s', - stdout, stderr) + stdout, stderr) + + +def GetRollRevisionRanges(opts, libyuv_deps): + current_cr_rev = libyuv_deps['vars']['chromium_revision'] + new_cr_rev = opts.revision + if not new_cr_rev: + stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) + head_rev = stdout.strip().split('\t')[0] + logging.info('No revision specified. Using HEAD: %s', head_rev) + new_cr_rev = head_rev + + return ChromiumRevisionUpdate(current_cr_rev, new_cr_rev) def main(): p = argparse.ArgumentParser() - p.add_argument('--clean', action='store_true', default=False, + p.add_argument('--clean', + action='store_true', + default=False, help='Removes any previous local roll branch.') - p.add_argument('-r', '--revision', + p.add_argument('-r', + '--revision', help=('Chromium Git revision to roll to. 
Defaults to the ' 'Chromium HEAD revision if omitted.')) - p.add_argument('--dry-run', action='store_true', default=False, + p.add_argument('--dry-run', + action='store_true', + default=False, help=('Calculate changes and modify DEPS, but don\'t create ' 'any local branch, commit, upload CL or send any ' 'tryjobs.')) - p.add_argument('-i', '--ignore-unclean-workdir', action='store_true', + p.add_argument('-i', + '--ignore-unclean-workdir', + action='store_true', default=False, help=('Ignore if the current branch is not main or if there ' 'are uncommitted changes (default: %(default)s).')) grp = p.add_mutually_exclusive_group() - grp.add_argument('--skip-cq', action='store_true', default=False, + grp.add_argument('--skip-cq', + action='store_true', + default=False, help='Skip sending the CL to the CQ (default: %(default)s)') - grp.add_argument('--cq-over', type=int, default=1, + grp.add_argument('--cq-over', + type=int, + default=1, help=('Commit queue dry run if the revision difference ' 'is below this number (default: %(default)s)')) - p.add_argument('-v', '--verbose', action='store_true', default=False, + p.add_argument('-v', + '--verbose', + action='store_true', + default=False, help='Be extra verbose in printing of log messages.') opts = p.parse_args() @@ -470,38 +768,52 @@ def main(): _RemovePreviousRollBranch(opts.dry_run) if not opts.ignore_unclean_workdir: - _EnsureUpdatedMasterBranch(opts.dry_run) - - new_cr_rev = opts.revision - if not new_cr_rev: - stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) - head_rev = stdout.strip().split('\t')[0] - logging.info('No revision specified. 
Using HEAD: %s', head_rev) - new_cr_rev = head_rev + _EnsureUpdatedMainBranch(opts.dry_run) deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') libyuv_deps = ParseLocalDepsFile(deps_filename) - current_cr_rev = libyuv_deps['vars']['chromium_revision'] - current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev)) - new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev)) + rev_update = GetRollRevisionRanges(opts, libyuv_deps) - new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev) + current_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.current_chromium_rev)) + new_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.new_chromium_rev)) + + new_cr_content = ReadRemoteCrFile('DEPS', rev_update.new_chromium_rev) + new_cr_deps = ParseDepsDict(new_cr_content) changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) - clang_change = CalculateChangedClang(new_cr_rev) - commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev, - current_commit_pos, new_commit_pos, - changed_deps, clang_change) + # Discard other deps, assumed to be chromium-only dependencies. + new_generated_android_deps, _ = FindAddedDeps(libyuv_deps, new_cr_deps) + removed_generated_android_deps, other_deps = FindRemovedDeps( + libyuv_deps, new_cr_deps) + if other_deps: + raise RollError('LibYUV DEPS entries are missing from Chromium: %s.\n' + 'Remove them or add them to either ' + 'LIBYUV_ONLY_DEPS or DONT_AUTOROLL_THESE.' 
% other_deps) + clang_change = CalculateChangedClang(rev_update.new_chromium_rev) + commit_msg = GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps, + added_deps_paths=new_generated_android_deps, + removed_deps_paths=removed_generated_android_deps, + clang_change=clang_change) logging.debug('Commit message:\n%s', commit_msg) _CreateRollBranch(opts.dry_run) - UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps) - _LocalCommit(commit_msg, opts.dry_run) - commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, - current_commit_pos, new_commit_pos) - logging.info('Uploading CL...') if not opts.dry_run: - _UploadCL(commit_queue_mode) + UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content) + if _IsTreeClean(): + logging.info("No DEPS changes detected, skipping CL creation.") + else: + _LocalCommit(commit_msg, opts.dry_run) + commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, + current_commit_pos, new_commit_pos) + logging.info('Uploading CL...') + if not opts.dry_run: + _UploadCL(commit_queue_mode, _GetCcRecipients(changed_deps)) return 0 diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc index 1f975825..1f1896b0 100644 --- a/files/unit_test/convert_test.cc +++ b/files/unit_test/convert_test.cc @@ -48,6 +48,7 @@ namespace libyuv { #define AR30ToAR30 ARGBCopy #define ABGRToABGR ARGBCopy +// subsample amount uses a divide. 
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) // Planar test @@ -180,9 +181,12 @@ TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 12) TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) // Test Android 420 to I420 @@ -417,131 +421,136 @@ TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) -#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ - static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ - static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ - static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ - static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ - static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSrcHalfWidth = 
SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ - const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ - const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ - const int kPaddedHeight = \ - (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ - const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ - const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ - align_buffer_page_end( \ - src_uv, \ - 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_c, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_opt, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ - SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ - for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \ - src_y_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \ - src_uv_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? 
reinterpret_cast(dst_y_c) : NULL, kWidth, \ - reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ - reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - } \ - if (DOY) { \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - } \ - for (int i = 0; i < kDstHalfHeight; ++i) { \ - for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ - EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ - dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ +#define TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, DOY, SRC_DEPTH, \ + TILE_WIDTH, TILE_HEIGHT) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, 
DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ + const int kPaddedHeight = \ + (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ + const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ + align_buffer_page_end( \ + src_uv, \ + 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_c, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ + SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ + for (int i = 0; \ + i < kPaddedWidth * kPaddedHeight * SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_y_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2 * \ + SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_uv_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? 
reinterpret_cast(dst_y_c) : NULL, kWidth, \ + reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ + reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + } \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + } \ + for (int i = 0; i < kDstHalfHeight; ++i) { \ + for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ + EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ + dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \ - 
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) - -TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) - -#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ +#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, 
SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) + +TESTBPTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOBP(MT2T, uint8_t, 10 / 8, 2, 2, P010, uint16_t, 2, 2, 2, 10, 16, 32) + +#define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ 
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ @@ -621,30 +630,30 @@ TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) - -TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + 
TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) + +TESTBPTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOP(P010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10, 1, 1) +TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1) // Provide matrix wrappers for full range bt.709 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \ @@ -680,6 +689,12 @@ TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) #define I422ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ kFilterBilinear) +#define I420ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) +#define I422ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) #define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) @@ -792,8 +807,12 @@ TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) +TESTPLANARTOB(I422, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I422, 1, 1, RAW, 3, 3, 1) 
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) +TESTPLANARTOB(I444, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I444, 1, 1, RAW, 3, 3, 1) TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1) @@ -816,6 +835,8 @@ TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1) #endif TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) +TESTPLANARTOB(I422, 2, 2, RGB24Filter, 3, 3, 1) #else TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1) TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1) @@ -832,14 +853,15 @@ TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1) -TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) +TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) #endif @@ -1056,8 +1078,8 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1) TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) #endif -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, W1280, N, NEG, OFF) \ +#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ @@ -1110,15 +1132,15 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Opt, +, 0) +#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Invert, -, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Opt, +, 0) #define JNV12ToARGB(a, b, c, d, e, f, g, h) \ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) @@ -1139,29 +1161,29 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) #define JNV12ToRGB565(a, b, c, d, e, f, g, h) \ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) -TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3) +TESTBPTOB(JNV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(JNV21, 2, 2, RAW, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST 
-TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2) +TESTBPTOB(JNV12, 2, 2, RGB565, RGB565, 2) #endif -TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3) +TESTBPTOB(NV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, YUV24, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) +TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2) #endif #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1236,6 +1258,8 @@ TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2) +TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) @@ -1254,8 +1278,84 @@ TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) -#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ - SUBSAMP_Y, W1280, N, NEG, OFF) \ +#define TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV 
* SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_a_c, kWidth* kHeight); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_a_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 2, kWidth* kHeight); \ + memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 102, kWidth* kHeight); \ + memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ + dst_a_c, kWidth, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ + kStrideUV * 2, dst_a_opt, kWidth, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_a_c); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); 
\ + free_aligned_buffer_page_end(dst_a_opt); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#endif + +TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2) + +#define TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ @@ -1301,25 +1401,25 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) free_aligned_buffer_page_end(src_argb); \ } -#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) - 
-TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(RAW, 1, 3, JNV21, 2, 2) -TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) +#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTATOBP(ARGB, 1, 4, NV12, 2, 2) +TESTATOBP(ARGB, 1, 4, NV21, 2, 2) +TESTATOBP(ABGR, 1, 4, NV12, 2, 2) +TESTATOBP(ABGR, 1, 4, NV21, 2, 2) +TESTATOBP(RAW, 1, 3, JNV21, 2, 2) +TESTATOBP(YUY2, 2, 4, NV12, 2, 2) +TESTATOBP(UYVY, 2, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ @@ -1440,6 +1540,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) @@ -1450,7 +1551,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 
1) -TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) @@ -1484,6 +1585,127 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// in place test +#define TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memcpy(dst_argb_c + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_c /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_c, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + 
(TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTATOA(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Inplace, +, 0) + +TESTATOA(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) +#endif +TESTATOA(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOA(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +// TODO(fbarchard): Support in place for mirror. 
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) +// TODO(fbarchard): Support in place for conversions that increase bpp. +// TESTATOA(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +// TESTATOA(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, 
uint8_t, 4, 4, 1) +// TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) + #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ @@ -2065,6 +2287,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_u, half_width * half_height); @@ -2099,6 +2324,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } // Convert to NV21 align_buffer_page_end(dst_y, width * height); @@ -2158,6 +2386,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } // Convert to NV12 align_buffer_page_end(dst_y, width * height); @@ -2217,6 +2448,9 
@@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2247,6 +2481,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2282,6 +2519,9 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2312,6 +2552,9 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2346,6 +2589,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2376,6 +2622,9 @@ TEST_F(LibYUVConvertTest, 
TestMJPGToNV12_400) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2410,6 +2659,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2440,6 +2692,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2472,6 +2727,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) { int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_argb, width * height * 4); for (int times = 0; times < benchmark_iterations; ++times) { @@ -2921,6 +3179,51 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) +TEST_F(LibYUVConvertTest, MM21ToYUY2) { + const int kWidth = (benchmark_width_ + 15) & (~15); + const int kHeight = (benchmark_height_ + 31) & (~31); + + align_buffer_page_end(orig_y, kWidth * kHeight); + align_buffer_page_end(orig_uv, + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(tmp_y, kWidth * kHeight); + 
align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + + MemRandomize(orig_y, kWidth * kHeight); + MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + /* Convert MM21 to YUY2 in 2 steps for reference */ + libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y, + kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), kWidth, kHeight); + libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), golden_yuyv, + 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + + /* Convert to NV12 */ + for (int i = 0; i < benchmark_iterations_; ++i) { + libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), + dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + } + + for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) { + EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]); + } + + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tmp_y); + free_aligned_buffer_page_end(tmp_u); + free_aligned_buffer_page_end(tmp_v); + free_aligned_buffer_page_end(dst_yuyv); + free_aligned_buffer_page_end(golden_yuyv); +} + // Transitive test. A to B to C is same as A to C. // Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. 
#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -3353,6 +3656,8 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) @@ -3495,6 +3800,7 @@ TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AB30, 4, 4, 1) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30Filter, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1) #endif // LITTLE_ENDIAN_ONLY_TEST @@ -3733,8 +4039,8 @@ TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGBFilter, 4, 4, 1, 10) TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) #endif // DISABLE_SLOW_TESTS -#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ +#define TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -3777,16 +4083,16 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, 
BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Opt, +, 0, 0, S_DEPTH) #define P010ToARGB(a, b, c, d, e, f, g, h) \ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) @@ -3829,23 +4135,23 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) kFilterBilinear) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) -TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, 
ARGBFilter, 4, 4, 1, 10) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) #endif // LITTLE_ENDIAN_ONLY_TEST #endif // DISABLE_SLOW_TESTS diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc index 080778f5..93867fa7 100644 --- a/files/unit_test/cpu_test.cc +++ b/files/unit_test/cpu_test.cc @@ -20,13 +20,23 @@ namespace libyuv { TEST_F(LibYUVBaseTest, TestCpuHas) { int cpu_flags = TestCpuFlag(-1); - printf("Cpu Flags %d\n", cpu_flags); + printf("Cpu Flags 0x%x\n", cpu_flags); #if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); - printf("Has ARM %d\n", has_arm); + printf("Has ARM 0x%x\n", has_arm); int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %d\n", has_neon); + printf("Has NEON 0x%x\n", has_neon); #endif +#if defined(__riscv) && defined(__linux__) + int has_riscv = TestCpuFlag(kCpuHasRISCV); + printf("Has RISCV 0x%x\n", has_riscv); + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RVV 0x%x\n", has_rvv); + int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH); + printf("Has RVVZVFH 0x%x\n", has_rvvzvfh); +#endif +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); int has_sse2 = 
TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); @@ -45,39 +55,38 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has X86 %d\n", has_x86); - printf("Has SSE2 %d\n", has_sse2); - printf("Has SSSE3 %d\n", has_ssse3); - printf("Has SSE41 %d\n", has_sse41); - printf("Has SSE42 %d\n", has_sse42); - printf("Has AVX %d\n", has_avx); - printf("Has AVX2 %d\n", has_avx2); - printf("Has ERMS %d\n", has_erms); - printf("Has FMA3 %d\n", has_fma3); - printf("Has F16C %d\n", has_f16c); - printf("Has GFNI %d\n", has_gfni); - printf("Has AVX512BW %d\n", has_avx512bw); - printf("Has AVX512VL %d\n", has_avx512vl); - printf("Has AVX512VNNI %d\n", has_avx512vnni); - printf("Has AVX512VBMI %d\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %d\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq); - + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE41 0x%x\n", has_sse41); + printf("Has SSE42 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has GFNI 0x%x\n", has_gfni); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ 0x%x\n", has_avx512vpopcntdq); +#endif #if defined(__mips__) int has_mips = TestCpuFlag(kCpuHasMIPS); - printf("Has MIPS %d\n", has_mips); + printf("Has MIPS 
0x%x\n", has_mips); int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %d\n", has_msa); + printf("Has MSA 0x%x\n", has_msa); #endif - #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); - printf("Has LOONGARCH %d\n", has_loongarch); + printf("Has LOONGARCH 0x%x\n", has_loongarch); int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %d\n", has_lsx); + printf("Has LSX 0x%x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - printf("Has LASX %d\n", has_lasx); + printf("Has LASX 0x%x\n", has_lasx); #endif } @@ -104,27 +113,33 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __i386__ printf("__i386__ %d\n", __i386__); #endif -#ifdef __mips - printf("__mips %d\n", __mips); -#endif -#ifdef __mips_isa_rev - printf("__mips_isa_rev %d\n", __mips_isa_rev); -#endif #ifdef __x86_64__ printf("__x86_64__ %d\n", __x86_64__); #endif +#ifdef _M_IX86 + printf("_M_IX86 %d\n", _M_IX86); +#endif +#ifdef _M_X64 + printf("_M_X64 %d\n", _M_X64); +#endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif #ifdef __aarch64__ printf("__aarch64__ %d\n", __aarch64__); #endif -#ifdef __APPLE__ - printf("__APPLE__ %d\n", __APPLE__); -#endif #ifdef __arm__ printf("__arm__ %d\n", __arm__); #endif +#ifdef __riscv + printf("__riscv %d\n", __riscv); +#endif +#ifdef __riscv_vector + printf("__riscv_vector %d\n", __riscv_vector); +#endif +#ifdef __APPLE__ + printf("__APPLE__ %d\n", __APPLE__); +#endif #ifdef __clang__ printf("__clang__ %d\n", __clang__); #endif @@ -140,20 +155,11 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __mips_msa printf("__mips_msa %d\n", __mips_msa); #endif -#ifdef __native_client__ - printf("__native_client__ %d\n", __native_client__); -#endif -#ifdef __pic__ - printf("__pic__ %d\n", __pic__); -#endif -#ifdef __pnacl__ - printf("__pnacl__ %d\n", __pnacl__); -#endif -#ifdef _M_IX86 - printf("_M_IX86 %d\n", _M_IX86); +#ifdef __mips + printf("__mips %d\n", __mips); #endif -#ifdef _M_X64 - printf("_M_X64 %d\n", _M_X64); 
+#ifdef __mips_isa_rev + printf("__mips_isa_rev %d\n", __mips_isa_rev); #endif #ifdef _MIPS_ARCH_LOONGSON3A printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); #endif @@ -164,6 +170,15 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef _WIN32 printf("_WIN32 %d\n", _WIN32); #endif +#ifdef __native_client__ + printf("__native_client__ %d\n", __native_client__); +#endif +#ifdef __pic__ + printf("__pic__ %d\n", __pic__); +#endif +#ifdef __pnacl__ + printf("__pnacl__ %d\n", __pnacl__); +#endif #ifdef GG_LONGLONG printf("GG_LONGLONG %d\n", GG_LONGLONG); #endif @@ -200,8 +215,9 @@ TEST_F(LibYUVBaseTest, TestCpuId) { cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; - printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]), - cpu_info[0], cpu_info[1], cpu_info[2]); + printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n", + reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1], + cpu_info[2]); EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0]))); // CPU Family and Model @@ -264,6 +280,32 @@ TEST_F(LibYUVBaseTest, TestLinuxMipsMsa) { } } +TEST_F(LibYUVBaseTest, TestLinuxRVV) { + if (FileExists("../../unit_test/testdata/riscv64.txt")) { + printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n"); + + EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt")); + EXPECT_EQ(kCpuHasRVV, + RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt")); + EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH, + RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt")); + } else { + printf( + "WARNING: unable to load " + "\"../../unit_test/testdata/riscv64.txt\"\n"); + } +#if defined(__linux__) && defined(__riscv) + if (FileExists("/proc/cpuinfo")) { + if (!(kCpuHasRVV & RiscvCpuCaps("/proc/cpuinfo"))) { + // This can happen on RVV emulator but /proc/cpuinfo is from host.
+ printf("WARNING: RVV build enabled but CPU does not have RVV\n"); + } + } else { + printf("WARNING: unable to load \"/proc/cpuinfo\"\n"); + } +#endif +} + // TODO(fbarchard): Fix clangcl test of cpuflags. #ifdef _MSC_VER TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) { diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc index 3a8c470b..ad97b87e 100644 --- a/files/unit_test/planar_test.cc +++ b/files/unit_test/planar_test.cc @@ -1638,29 +1638,29 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int y_plane_size = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_y, orig_plane_size); + align_buffer_page_end(tile_y, tile_plane_size); align_buffer_page_end(dst_c, y_plane_size); align_buffer_page_end(dst_opt, y_plane_size); - MemRandomize(orig_y, orig_plane_size); + MemRandomize(tile_y, tile_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 0, y_plane_size); // Disable all optimizations. MaskCpuFlags(disable_cpu_flags_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_, benchmark_height_, 16); } // Enable optimizations. 
MaskCpuFlags(benchmark_cpu_info_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_, benchmark_height_, 16); } @@ -1668,7 +1668,46 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { EXPECT_EQ(dst_c[i], dst_opt[i]); } - free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(tile_y); + free_aligned_buffer_page_end(dst_c); + free_aligned_buffer_page_end(dst_opt); +} + +TEST_F(LibYUVPlanarTest, TestDetilePlane_16) { + int i, j; + + // orig is tiled. Allocate enough memory for tiles. + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height * 2; + int y_plane_size = benchmark_width_ * benchmark_height_ * 2; + align_buffer_page_end(tile_y, tile_plane_size); + align_buffer_page_end(dst_c, y_plane_size); + align_buffer_page_end(dst_opt, y_plane_size); + + MemRandomize(tile_y, tile_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 0, y_plane_size); + + // Disable all optimizations. + MaskCpuFlags(disable_cpu_flags_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + // Enable optimizations. + MaskCpuFlags(benchmark_cpu_info_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end(tile_y); free_aligned_buffer_page_end(dst_c); free_aligned_buffer_page_end(dst_opt); } @@ -1678,33 +1717,33 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { int i, j; // orig is tiled. Allocate enough memory for tiles. 
- int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); - align_buffer_page_end(detiled_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); + align_buffer_page_end(detiled_uv, tile_plane_size); align_buffer_page_end(dst_u_two_stage, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_two_stage, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); - memset(detiled_uv, 0, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); + memset(detiled_uv, 0, tile_plane_size); memset(dst_u_two_stage, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_two_stage, 0, uv_plane_size); memset(dst_v_opt, 0, uv_plane_size); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); // Benchmark 2 step conversion for comparison. 
for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_, + DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_, benchmark_width_, benchmark_height_, 16); - SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage, + SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage, (benchmark_width_ + 1) / 2, dst_v_two_stage, (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2, benchmark_height_); @@ -1715,7 +1754,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(detiled_uv); free_aligned_buffer_page_end(dst_u_two_stage); free_aligned_buffer_page_end(dst_u_opt); @@ -1727,17 +1766,17 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); align_buffer_page_end(dst_u_c, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_c, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); memset(dst_u_c, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_c, 0, uv_plane_size); @@ -1746,7 +1785,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags_); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2, dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); @@ -1755,7 +1794,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { for (j = 0; j < benchmark_iterations_; j++) { DetileSplitUVPlane( - orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, + tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); } @@ -1764,7 +1803,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_c); @@ -3495,8 +3534,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + // Round count up to multiple of 8 + const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc index 01ed69ca..74952c4e 100644 --- a/files/unit_test/rotate_argb_test.cc +++ b/files/unit_test/rotate_argb_test.cc @@ -225,4 +225,110 @@ TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) { free_aligned_buffer_page_end(src_argb); } +static void TestRotatePlane_16(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int 
benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height < 1) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_stride = src_width; + int src_plane_size = src_stride * abs(src_height); + align_buffer_page_end_16(src, src_plane_size); + for (int i = 0; i < src_plane_size; ++i) { + src[i] = fastrand() & 0xff; + } + + int dst_stride = dst_width; + int dst_plane_size = dst_stride * dst_height; + align_buffer_page_end_16(dst_c, dst_plane_size); + align_buffer_page_end_16(dst_opt, dst_plane_size); + memset(dst_c, 2, dst_plane_size); + memset(dst_opt, 3, dst_plane_size); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + RotatePlane_16(src, src_stride, dst_c, dst_stride, src_width, src_height, + mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + RotatePlane_16(src, src_stride, dst_opt, dst_stride, src_width, src_height, + mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_c); + free_aligned_buffer_page_end_16(dst_opt); + free_aligned_buffer_page_end_16(src); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, 
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + } // namespace libyuv diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc index d3887414..abc08efa 100644 --- a/files/unit_test/rotate_test.cc +++ b/files/unit_test/rotate_test.cc @@ -14,6 +14,10 @@ #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/rotate_row.h" +#endif + namespace libyuv { #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) @@ -596,4 +600,363 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) #undef TESTAPLANARTOP #undef TESTAPLANARTOPI +static void I010TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i010_y_size = src_width * Abs(src_height); + int src_i010_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2); + int src_i010_size = src_i010_y_size + src_i010_uv_size * 2; + align_buffer_page_end_16(src_i010, src_i010_size); + for (int i = 0; i < src_i010_size; ++i) { + src_i010[i] = fastrand() & 0x3ff; + } + + int dst_i010_y_size = dst_width * dst_height; + int dst_i010_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); + int dst_i010_size = dst_i010_y_size + dst_i010_uv_size * 2; + align_buffer_page_end_16(dst_i010_c, dst_i010_size); + align_buffer_page_end_16(dst_i010_opt, dst_i010_size); + memset(dst_i010_c, 2, dst_i010_size * 2); + memset(dst_i010_opt, 3, dst_i010_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
+ I010Rotate(src_i010, src_width, src_i010 + src_i010_y_size, + (src_width + 1) / 2, src_i010 + src_i010_y_size + src_i010_uv_size, + (src_width + 1) / 2, dst_i010_c, dst_width, + dst_i010_c + dst_i010_y_size, (dst_width + 1) / 2, + dst_i010_c + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I010Rotate( + src_i010, src_width, src_i010 + src_i010_y_size, (src_width + 1) / 2, + src_i010 + src_i010_y_size + src_i010_uv_size, (src_width + 1) / 2, + dst_i010_opt, dst_width, dst_i010_opt + dst_i010_y_size, + (dst_width + 1) / 2, dst_i010_opt + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. + for (int i = 0; i < dst_i010_size; ++i) { + EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i010_c); + free_aligned_buffer_page_end_16(dst_i010_opt); + free_aligned_buffer_page_end_16(src_i010); +} + +TEST_F(LibYUVRotateTest, I010Rotate0_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate90_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate180_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate270_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I210TestRotate(int 
src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i210_y_size = src_width * Abs(src_height); + int src_i210_uv_size = ((src_width + 1) / 2) * Abs(src_height); + int src_i210_size = src_i210_y_size + src_i210_uv_size * 2; + align_buffer_page_end_16(src_i210, src_i210_size); + for (int i = 0; i < src_i210_size; ++i) { + src_i210[i] = fastrand() & 0x3ff; + } + + int dst_i210_y_size = dst_width * dst_height; + int dst_i210_uv_size = ((dst_width + 1) / 2) * dst_height; + int dst_i210_size = dst_i210_y_size + dst_i210_uv_size * 2; + align_buffer_page_end_16(dst_i210_c, dst_i210_size); + align_buffer_page_end_16(dst_i210_opt, dst_i210_size); + memset(dst_i210_c, 2, dst_i210_size * 2); + memset(dst_i210_opt, 3, dst_i210_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I210Rotate(src_i210, src_width, src_i210 + src_i210_y_size, + (src_width + 1) / 2, src_i210 + src_i210_y_size + src_i210_uv_size, + (src_width + 1) / 2, dst_i210_c, dst_width, + dst_i210_c + dst_i210_y_size, (dst_width + 1) / 2, + dst_i210_c + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I210Rotate( + src_i210, src_width, src_i210 + src_i210_y_size, (src_width + 1) / 2, + src_i210 + src_i210_y_size + src_i210_uv_size, (src_width + 1) / 2, + dst_i210_opt, dst_width, dst_i210_opt + dst_i210_y_size, + (dst_width + 1) / 2, dst_i210_opt + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i210_size; ++i) { + EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i210_c); + free_aligned_buffer_page_end_16(dst_i210_opt); + free_aligned_buffer_page_end_16(src_i210); +} + +TEST_F(LibYUVRotateTest, I210Rotate0_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate90_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate180_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate270_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I410TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i410_y_size = src_width * Abs(src_height); + int src_i410_uv_size = src_width * Abs(src_height); + int src_i410_size = src_i410_y_size + src_i410_uv_size * 2; + align_buffer_page_end_16(src_i410, src_i410_size); + for (int i = 0; i < src_i410_size; ++i) { + src_i410[i] = fastrand() & 0x3ff; + } + + int dst_i410_y_size = dst_width * dst_height; + int dst_i410_uv_size = dst_width * dst_height; + int dst_i410_size = dst_i410_y_size + dst_i410_uv_size * 2; + 
align_buffer_page_end_16(dst_i410_c, dst_i410_size); + align_buffer_page_end_16(dst_i410_opt, dst_i410_size); + memset(dst_i410_c, 2, dst_i410_size * 2); + memset(dst_i410_opt, 3, dst_i410_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_c, dst_width, dst_i410_c + dst_i410_y_size, dst_width, + dst_i410_c + dst_i410_y_size + dst_i410_uv_size, dst_width, + src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_opt, dst_width, dst_i410_opt + dst_i410_y_size, + dst_width, dst_i410_opt + dst_i410_y_size + dst_i410_uv_size, + dst_width, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i410_size; ++i) { + EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i410_c); + free_aligned_buffer_page_end_16(dst_i410_opt); + free_aligned_buffer_page_end_16(src_i410); +} + +TEST_F(LibYUVRotateTest, I410Rotate0_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate90_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate180_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate270_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +#if defined(ENABLE_ROW_TESTS) + +TEST_F(LibYUVRotateTest, Transpose4x4_Test) { + // dst width and height + const int width = 4; + const int height = 4; + int src_pixels[4][4]; + int dst_pixels_c[4][4]; + int dst_pixels_opt[4][4]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + src_pixels[i][j] = i * 10 + j; + } + } + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + const int benchmark_iterations = + (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) / + (4 * 4); + for (int i = 0; i < benchmark_iterations; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + 
(uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]); + EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]); + } + } +} + +TEST_F(LibYUVRotateTest, Transpose4x4_Opt) { + // dst width and height + const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3; + const int height = 4; + align_buffer_page_end(src_pixels, height * width * 4); + align_buffer_page_end(dst_pixels_c, width * height * 4); + align_buffer_page_end(dst_pixels_opt, width * height * 4); + + MemRandomize(src_pixels, height * width * 4); + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + for (int i = 0; i < benchmark_iterations_; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + 
free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_opt); +} + +#endif // ENABLE_ROW_TESTS + } // namespace libyuv diff --git a/files/unit_test/scale_uv_test.cc b/files/unit_test/scale_uv_test.cc index 3d524bef..dab217c9 100644 --- a/files/unit_test/scale_uv_test.cc +++ b/files/unit_test/scale_uv_test.cc @@ -39,55 +39,35 @@ static int UVTestFilter(int src_width, return 0; } - int i, j; - const int b = 0; // 128 to test for padding/stride. - int64_t src_uv_plane_size = - (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; - int src_stride_uv = (b * 2 + Abs(src_width)) * 2; + int i; + int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL; + int src_stride_uv = Abs(src_width) * 2; + int64_t dst_uv_plane_size = dst_width * dst_height * 2LL; + int dst_stride_uv = dst_width * 2; align_buffer_page_end(src_uv, src_uv_plane_size); - if (!src_uv) { - printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); - return 0; - } - MemRandomize(src_uv, src_uv_plane_size); - - int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; - int dst_stride_uv = (b * 2 + dst_width) * 2; - align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); - if (!dst_uv_c || !dst_uv_opt) { + + if (!src_uv || !dst_uv_c || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } + MemRandomize(src_uv, src_uv_plane_size); memset(dst_uv_c, 2, dst_uv_plane_size); - memset(dst_uv_opt, 3, dst_uv_plane_size); - - // Warm up both versions for consistent benchmarks. - MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); - MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
- UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + memset(dst_uv_opt, 123, dst_uv_plane_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_c, dst_stride_uv, dst_width, dst_height, f); - c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_opt, + dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; @@ -95,18 +75,11 @@ static int UVTestFilter(int src_width, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference - // of the buffers and look to see that the max difference isn't - // over 2. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b * 2; j < (dst_width + b) * 2; ++j) { - int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - - dst_uv_opt[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } @@ -121,28 +94,26 @@ static int UVTestFilter(int src_width, #define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(name, filter, nom, denom) \ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ int diff = UVTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ + EXPECT_EQ(0, diff); \ } #if defined(ENABLE_FULL_TESTS) -// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but -// filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +// Test a scale factor with all 4 filters. Expect exact for SIMD vs C. +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(name, None, nom, denom) \ + TEST_FACTOR1(name, Linear, nom, denom) \ + TEST_FACTOR1(name, Bilinear, nom, denom) \ + TEST_FACTOR1(name, Box, nom, denom) #else // Test a scale factor with Bilinear. 
-#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) +#define TEST_FACTOR(name, nom, denom) TEST_FACTOR1(name, Bilinear, nom, denom) #endif TEST_FACTOR(2, 1, 2) diff --git a/files/unit_test/testdata/riscv64.txt b/files/unit_test/testdata/riscv64.txt new file mode 100644 index 00000000..fbb4200f --- /dev/null +++ b/files/unit_test/testdata/riscv64.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imac +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/testdata/riscv64_rvv.txt b/files/unit_test/testdata/riscv64_rvv.txt new file mode 100644 index 00000000..af1b3f36 --- /dev/null +++ b/files/unit_test/testdata/riscv64_rvv.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/testdata/riscv64_rvv_zvfh.txt b/files/unit_test/testdata/riscv64_rvv_zvfh.txt new file mode 100644 index 00000000..c416c1af --- /dev/null +++ b/files/unit_test/testdata/riscv64_rvv_zvfh.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv_zfh_zvfh +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc index 61145a46..b66ebfab 100644 --- a/files/unit_test/unit_test.cc +++ b/files/unit_test/unit_test.cc @@ -88,6 +88,11 @@ int TestCpuEnv(int cpu_info) { cpu_info &= ~libyuv::kCpuHasLASX; } #endif +#if defined(__riscv) && defined(__linux__) + if (TestEnv("LIBYUV_DISABLE_RVV")) { + cpu_info &= ~libyuv::kCpuHasRVV; + } +#endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86)) diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h index 0a8df4d2..99cc8d19 100644 --- a/files/unit_test/unit_test.h +++ b/files/unit_test/unit_test.h @@ -11,10 +11,10 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ +#include // For NULL #ifdef _WIN32 #include #else -#include #include #endif @@ -77,7 
+77,18 @@ static inline bool SizeValid(int src_width, #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ - var = 0 + var = NULL + +#define align_buffer_page_end_16(var, size) \ + uint8_t* var##_mem = \ + reinterpret_cast(malloc(((size)*2 + 4095 + 63) & ~4095)); \ + uint16_t* var = reinterpret_cast( \ + (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \ + ~63) + +#define free_aligned_buffer_page_end_16(var) \ + free(var##_mem); \ + var = NULL #ifdef WIN32 static inline double get_time() { diff --git a/files/util/cpuid.c b/files/util/cpuid.c index b618bb10..edc6a26e 100644 --- a/files/util/cpuid.c +++ b/files/util/cpuid.c @@ -21,8 +21,9 @@ using namespace libyuv; int main(int argc, const char* argv[]) { int cpu_flags = TestCpuFlag(-1); int has_arm = TestCpuFlag(kCpuHasARM); - int has_mips = TestCpuFlag(kCpuHasMIPS); + int has_riscv = TestCpuFlag(kCpuHasRISCV); int has_x86 = TestCpuFlag(kCpuHasX86); + int has_mips = TestCpuFlag(kCpuHasMIPS); int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); (void)argc; (void)argv; @@ -62,24 +63,28 @@ int main(int argc, const char* argv[]) { model, model); } #endif - printf("Cpu Flags %x\n", cpu_flags); - printf("Has ARM %x\n", has_arm); - printf("Has MIPS %x\n", has_mips); - printf("Has X86 %x\n", has_x86); - printf("Has LOONGARCH %x\n", has_loongarch); + printf("Cpu Flags 0x%x\n", cpu_flags); if (has_arm) { int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %x\n", has_neon); + printf("Has ARM 0x%x\n", has_arm); + printf("Has NEON 0x%x\n", has_neon); + } + if (has_riscv) { + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RISCV 0x%x\n", has_riscv); + printf("Has RVV 0x%x\n", has_rvv); } if (has_mips) { int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %x\n", has_msa); + printf("Has MIPS 0x%x\n", has_mips); + printf("Has MSA 0x%x\n", has_msa); } if (has_loongarch) { int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - 
printf("Has LASX %x\n", has_lasx); + printf("Has LOONGARCH 0x%x\n", has_loongarch); + printf("Has LSX 0x%x\n", has_lsx); + printf("Has LASX 0x%x\n", has_lasx); } if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); @@ -99,23 +104,24 @@ int main(int argc, const char* argv[]) { int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has SSE2 %x\n", has_sse2); - printf("Has SSSE3 %x\n", has_ssse3); - printf("Has SSE4.1 %x\n", has_sse41); - printf("Has SSE4.2 %x\n", has_sse42); - printf("Has AVX %x\n", has_avx); - printf("Has AVX2 %x\n", has_avx2); - printf("Has ERMS %x\n", has_erms); - printf("Has FMA3 %x\n", has_fma3); - printf("Has F16C %x\n", has_f16c); - printf("Has GFNI %x\n", has_gfni); - printf("Has AVX512BW %x\n", has_avx512bw); - printf("Has AVX512VL %x\n", has_avx512vl); - printf("Has AVX512VNNI %x\n", has_avx512vnni); - printf("Has AVX512VBMI %x\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE4.1 0x%x\n", has_sse41); + printf("Has SSE4.2 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has GFNI 0x%x\n", has_gfni); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ 0x%x\n", has_avx512vpopcntdq); } return 0; } 
diff --git a/files/util/yuvconstants.c b/files/util/yuvconstants.c index 037e0824..4e5185af 100644 --- a/files/util/yuvconstants.c +++ b/files/util/yuvconstants.c @@ -43,9 +43,10 @@ // #define BR (-VR * 128 + YB) int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("yuvconstants Kr Kb\n"); - printf(" MC BT KR = 0.2126; KB = 0.0722\n"); + if (argc < 3) { + printf("yuvconstants [KR] [KB]\n"); + printf(" e.g. yuvconstants 0.2126 0.0722\n"); + printf(" MC BT KR KB\n"); printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n"); printf(" 4 FCC KR = 0.30; KB = 0.11\n"); printf(" 6 BT.601 KR = 0.299; KB = 0.114\n"); @@ -53,8 +54,8 @@ int main(int argc, const char* argv[]) { printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n"); return -1; } - float kr = atof(argv[1]); - float kb = atof(argv[2]); + float kr = (float)atof(argv[1]); + float kb = (float)atof(argv[2]); float kg = 1 - kr - kb; float vr = 2 * (1 - kr); diff --git a/files/util/yuvconvert.cc b/files/util/yuvconvert.cc index 332699e3..93b52668 100644 --- a/files/util/yuvconvert.cc +++ b/files/util/yuvconvert.cc @@ -42,9 +42,9 @@ static __inline uint32_t Abs(int32_t v) { } // Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { +static bool ExtractResolutionFromFilename(const char* name, + int* width_ptr, + int* height_ptr) { // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { @@ -59,7 +59,7 @@ bool ExtractResolutionFromFilename(const char* name, return false; } -void PrintHelp(const char* program) { +static void PrintHelp(const char* program) { printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); printf( " -s .... specify source resolution. 
" @@ -78,7 +78,7 @@ void PrintHelp(const char* program) { exit(0); } -void ParseOptions(int argc, const char* argv[]) { +static void ParseOptions(int argc, const char* argv[]) { if (argc <= 1) { PrintHelp(argv[0]); } -- cgit v1.2.3