diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2024-03-21 23:41:35 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2024-03-21 23:41:35 +0000 |
commit | 4d6f34f32c7828ab935beb54cdca08bd7bbd39c3 (patch) | |
tree | 7aa355fd0b89ec0b2611e17ee84a14c6fa449e22 | |
parent | 7447ad8db319c34111a3d43b64b9485ac1a8115d (diff) | |
parent | 50ea3b267729413aa561eaea5aac7e6fcd565101 (diff) | |
download | libyuv-androidx-fragment-release.tar.gz |
Merge "Snap for 11610999 from 488a2af021e3e7473f083a9435b1472c0d411f3d to androidx-fragment-release" into androidx-fragment-releaseandroidx-fragment-release
-rw-r--r-- | .clang-format (renamed from files/.clang-format) | 0 | ||||
-rw-r--r-- | .gitignore (renamed from files/.gitignore) | 0 | ||||
-rw-r--r-- | .gn (renamed from files/.gn) | 4 | ||||
-rw-r--r-- | .vpython (renamed from files/.vpython) | 0 | ||||
-rw-r--r-- | .vpython3 (renamed from files/.vpython3) | 4 | ||||
-rw-r--r-- | AUTHORS (renamed from files/AUTHORS) | 0 | ||||
-rw-r--r-- | Android.bp | 179 | ||||
-rw-r--r-- | BUILD | 14 | ||||
-rw-r--r-- | BUILD.gn (renamed from files/BUILD.gn) | 77 | ||||
-rw-r--r-- | CM_linux_packages.cmake (renamed from files/CM_linux_packages.cmake) | 4 | ||||
-rw-r--r-- | CMakeLists.txt (renamed from files/CMakeLists.txt) | 26 | ||||
-rw-r--r-- | DEPS (renamed from files/DEPS) | 1114 | ||||
-rw-r--r-- | DIR_METADATA (renamed from files/DIR_METADATA) | 0 | ||||
-rw-r--r-- | LICENSE | 2 | ||||
-rw-r--r-- | METADATA | 23 | ||||
-rw-r--r-- | OWNERS | 15 | ||||
-rw-r--r-- | OWNERS.android | 1 | ||||
-rw-r--r-- | PATENTS (renamed from files/PATENTS) | 0 | ||||
-rw-r--r-- | PRESUBMIT.py (renamed from files/PRESUBMIT.py) | 0 | ||||
-rw-r--r-- | README.chromium (renamed from files/README.chromium) | 5 | ||||
-rw-r--r-- | README.md (renamed from files/README.md) | 1 | ||||
-rw-r--r-- | README.version | 8 | ||||
-rw-r--r-- | UPDATING | 36 | ||||
-rw-r--r-- | build_overrides/build.gni (renamed from files/build_overrides/build.gni) | 3 | ||||
-rw-r--r-- | build_overrides/gtest.gni (renamed from files/build_overrides/gtest.gni) | 0 | ||||
-rw-r--r-- | build_overrides/partition_alloc.gni | 17 | ||||
-rwxr-xr-x | cleanup_links.py (renamed from files/cleanup_links.py) | 0 | ||||
-rw-r--r-- | codereview.settings | 4 | ||||
-rw-r--r-- | docs/deprecated_builds.md (renamed from files/docs/deprecated_builds.md) | 4 | ||||
-rw-r--r-- | docs/environment_variables.md (renamed from files/docs/environment_variables.md) | 3 | ||||
-rw-r--r-- | docs/filtering.md (renamed from files/docs/filtering.md) | 0 | ||||
-rw-r--r-- | docs/formats.md (renamed from files/docs/formats.md) | 0 | ||||
-rw-r--r-- | docs/getting_started.md (renamed from files/docs/getting_started.md) | 47 | ||||
-rw-r--r-- | docs/rotation.md (renamed from files/docs/rotation.md) | 0 | ||||
-rw-r--r-- | download_vs_toolchain.py (renamed from files/download_vs_toolchain.py) | 0 | ||||
-rw-r--r-- | files/Android.bp | 196 | ||||
-rw-r--r-- | files/LICENSE | 29 | ||||
-rw-r--r-- | files/codereview.settings | 5 | ||||
-rw-r--r-- | files/public.mk | 13 | ||||
-rw-r--r-- | files/source/compare_mmi.cc | 123 | ||||
-rw-r--r-- | files/source/rotate_common.cc | 106 | ||||
-rw-r--r-- | files/source/rotate_mmi.cc | 291 | ||||
-rw-r--r-- | files/source/row_mmi.cc | 7842 | ||||
-rw-r--r-- | files/source/scale_mmi.cc | 1168 | ||||
-rwxr-xr-x | files/tools_libyuv/autoroller/roll_deps.py | 509 | ||||
-rw-r--r-- | include/libyuv.h (renamed from files/include/libyuv.h) | 0 | ||||
-rw-r--r-- | include/libyuv/basic_types.h (renamed from files/include/libyuv/basic_types.h) | 0 | ||||
-rw-r--r-- | include/libyuv/compare.h (renamed from files/include/libyuv/compare.h) | 0 | ||||
-rw-r--r-- | include/libyuv/compare_row.h (renamed from files/include/libyuv/compare_row.h) | 5 | ||||
-rw-r--r-- | include/libyuv/convert.h (renamed from files/include/libyuv/convert.h) | 123 | ||||
-rw-r--r-- | include/libyuv/convert_argb.h (renamed from files/include/libyuv/convert_argb.h) | 126 | ||||
-rw-r--r-- | include/libyuv/convert_from.h (renamed from files/include/libyuv/convert_from.h) | 0 | ||||
-rw-r--r-- | include/libyuv/convert_from_argb.h (renamed from files/include/libyuv/convert_from_argb.h) | 51 | ||||
-rw-r--r-- | include/libyuv/cpu_id.h (renamed from files/include/libyuv/cpu_id.h) | 30 | ||||
-rw-r--r-- | include/libyuv/loongson_intrinsics.h (renamed from files/include/libyuv/loongson_intrinsics.h) | 0 | ||||
-rw-r--r-- | include/libyuv/macros_msa.h (renamed from files/include/libyuv/macros_msa.h) | 0 | ||||
-rw-r--r-- | include/libyuv/mjpeg_decoder.h (renamed from files/include/libyuv/mjpeg_decoder.h) | 0 | ||||
-rw-r--r-- | include/libyuv/planar_functions.h (renamed from files/include/libyuv/planar_functions.h) | 70 | ||||
-rw-r--r-- | include/libyuv/rotate.h (renamed from files/include/libyuv/rotate.h) | 64 | ||||
-rw-r--r-- | include/libyuv/rotate_argb.h (renamed from files/include/libyuv/rotate_argb.h) | 0 | ||||
-rw-r--r-- | include/libyuv/rotate_row.h (renamed from files/include/libyuv/rotate_row.h) | 50 | ||||
-rw-r--r-- | include/libyuv/row.h (renamed from files/include/libyuv/row.h) | 1148 | ||||
-rw-r--r-- | include/libyuv/scale.h (renamed from files/include/libyuv/scale.h) | 55 | ||||
-rw-r--r-- | include/libyuv/scale_argb.h (renamed from files/include/libyuv/scale_argb.h) | 0 | ||||
-rw-r--r-- | include/libyuv/scale_rgb.h (renamed from files/include/libyuv/scale_rgb.h) | 0 | ||||
-rw-r--r-- | include/libyuv/scale_row.h (renamed from files/include/libyuv/scale_row.h) | 187 | ||||
-rw-r--r-- | include/libyuv/scale_uv.h (renamed from files/include/libyuv/scale_uv.h) | 0 | ||||
-rw-r--r-- | include/libyuv/version.h (renamed from files/include/libyuv/version.h) | 2 | ||||
-rw-r--r-- | include/libyuv/video_common.h (renamed from files/include/libyuv/video_common.h) | 0 | ||||
-rw-r--r-- | infra/config/OWNERS | 3 | ||||
-rw-r--r-- | infra/config/PRESUBMIT.py (renamed from files/infra/config/PRESUBMIT.py) | 2 | ||||
-rw-r--r-- | infra/config/README.md (renamed from files/infra/config/README.md) | 0 | ||||
-rw-r--r-- | infra/config/codereview.settings (renamed from files/infra/config/codereview.settings) | 0 | ||||
-rw-r--r-- | infra/config/commit-queue.cfg (renamed from files/infra/config/commit-queue.cfg) | 0 | ||||
-rw-r--r-- | infra/config/cr-buildbucket.cfg (renamed from files/infra/config/cr-buildbucket.cfg) | 401 | ||||
-rw-r--r-- | infra/config/luci-logdog.cfg (renamed from files/infra/config/luci-logdog.cfg) | 0 | ||||
-rw-r--r-- | infra/config/luci-milo.cfg (renamed from files/infra/config/luci-milo.cfg) | 0 | ||||
-rw-r--r-- | infra/config/luci-scheduler.cfg (renamed from files/infra/config/luci-scheduler.cfg) | 0 | ||||
-rwxr-xr-x | infra/config/main.star (renamed from files/infra/config/main.star) | 45 | ||||
-rw-r--r-- | infra/config/project.cfg (renamed from files/infra/config/project.cfg) | 2 | ||||
-rw-r--r-- | infra/config/realms.cfg (renamed from files/infra/config/realms.cfg) | 4 | ||||
-rw-r--r-- | libyuv.gni (renamed from files/libyuv.gni) | 8 | ||||
-rw-r--r-- | libyuv.gyp (renamed from files/libyuv.gyp) | 0 | ||||
-rw-r--r-- | libyuv.gypi (renamed from files/libyuv.gypi) | 0 | ||||
-rw-r--r-- | linux.mk (renamed from files/linux.mk) | 6 | ||||
-rw-r--r-- | public.mk | 2 | ||||
-rw-r--r-- | pylintrc (renamed from files/pylintrc) | 0 | ||||
-rwxr-xr-x | riscv_script/prepare_toolchain_qemu.sh | 74 | ||||
-rw-r--r-- | riscv_script/riscv-clang.cmake | 55 | ||||
-rwxr-xr-x | riscv_script/run_qemu.sh | 15 | ||||
-rw-r--r-- | source/compare.cc (renamed from files/source/compare.cc) | 6 | ||||
-rw-r--r-- | source/compare_common.cc (renamed from files/source/compare_common.cc) | 0 | ||||
-rw-r--r-- | source/compare_gcc.cc (renamed from files/source/compare_gcc.cc) | 2 | ||||
-rw-r--r-- | source/compare_msa.cc (renamed from files/source/compare_msa.cc) | 0 | ||||
-rw-r--r-- | source/compare_neon.cc (renamed from files/source/compare_neon.cc) | 0 | ||||
-rw-r--r-- | source/compare_neon64.cc (renamed from files/source/compare_neon64.cc) | 0 | ||||
-rw-r--r-- | source/compare_win.cc (renamed from files/source/compare_win.cc) | 0 | ||||
-rw-r--r-- | source/convert.cc (renamed from files/source/convert.cc) | 1018 | ||||
-rw-r--r-- | source/convert_argb.cc (renamed from files/source/convert_argb.cc) | 1639 | ||||
-rw-r--r-- | source/convert_from.cc (renamed from files/source/convert_from.cc) | 114 | ||||
-rw-r--r-- | source/convert_from_argb.cc (renamed from files/source/convert_from_argb.cc) | 1129 | ||||
-rw-r--r-- | source/convert_jpeg.cc (renamed from files/source/convert_jpeg.cc) | 0 | ||||
-rw-r--r-- | source/convert_to_argb.cc (renamed from files/source/convert_to_argb.cc) | 0 | ||||
-rw-r--r-- | source/convert_to_i420.cc (renamed from files/source/convert_to_i420.cc) | 0 | ||||
-rw-r--r-- | source/cpu_id.cc (renamed from files/source/cpu_id.cc) | 109 | ||||
-rw-r--r-- | source/mjpeg_decoder.cc (renamed from files/source/mjpeg_decoder.cc) | 4 | ||||
-rw-r--r-- | source/mjpeg_validate.cc (renamed from files/source/mjpeg_validate.cc) | 0 | ||||
-rw-r--r-- | source/planar_functions.cc (renamed from files/source/planar_functions.cc) | 751 | ||||
-rw-r--r-- | source/rotate.cc (renamed from files/source/rotate.cc) | 437 | ||||
-rw-r--r-- | source/rotate_any.cc (renamed from files/source/rotate_any.cc) | 0 | ||||
-rw-r--r-- | source/rotate_argb.cc (renamed from files/source/rotate_argb.cc) | 25 | ||||
-rw-r--r-- | source/rotate_common.cc | 198 | ||||
-rw-r--r-- | source/rotate_gcc.cc (renamed from files/source/rotate_gcc.cc) | 130 | ||||
-rw-r--r-- | source/rotate_lsx.cc (renamed from files/source/rotate_lsx.cc) | 0 | ||||
-rw-r--r-- | source/rotate_msa.cc (renamed from files/source/rotate_msa.cc) | 0 | ||||
-rw-r--r-- | source/rotate_neon.cc (renamed from files/source/rotate_neon.cc) | 40 | ||||
-rw-r--r-- | source/rotate_neon64.cc (renamed from files/source/rotate_neon64.cc) | 71 | ||||
-rw-r--r-- | source/rotate_win.cc (renamed from files/source/rotate_win.cc) | 0 | ||||
-rw-r--r-- | source/row_any.cc (renamed from files/source/row_any.cc) | 852 | ||||
-rw-r--r-- | source/row_common.cc (renamed from files/source/row_common.cc) | 915 | ||||
-rw-r--r-- | source/row_gcc.cc (renamed from files/source/row_gcc.cc) | 679 | ||||
-rw-r--r-- | source/row_lasx.cc (renamed from files/source/row_lasx.cc) | 406 | ||||
-rw-r--r-- | source/row_lsx.cc (renamed from files/source/row_lsx.cc) | 1534 | ||||
-rw-r--r-- | source/row_msa.cc (renamed from files/source/row_msa.cc) | 0 | ||||
-rw-r--r-- | source/row_neon.cc (renamed from files/source/row_neon.cc) | 370 | ||||
-rw-r--r-- | source/row_neon64.cc (renamed from files/source/row_neon64.cc) | 468 | ||||
-rw-r--r-- | source/row_rvv.cc | 1394 | ||||
-rw-r--r-- | source/row_win.cc (renamed from files/source/row_win.cc) | 65 | ||||
-rw-r--r-- | source/scale.cc (renamed from files/source/scale.cc) | 716 | ||||
-rw-r--r-- | source/scale_any.cc (renamed from files/source/scale_any.cc) | 16 | ||||
-rw-r--r-- | source/scale_argb.cc (renamed from files/source/scale_argb.cc) | 330 | ||||
-rw-r--r-- | source/scale_common.cc (renamed from files/source/scale_common.cc) | 220 | ||||
-rw-r--r-- | source/scale_gcc.cc (renamed from files/source/scale_gcc.cc) | 5 | ||||
-rw-r--r-- | source/scale_lsx.cc (renamed from files/source/scale_lsx.cc) | 0 | ||||
-rw-r--r-- | source/scale_msa.cc (renamed from files/source/scale_msa.cc) | 0 | ||||
-rw-r--r-- | source/scale_neon.cc (renamed from files/source/scale_neon.cc) | 39 | ||||
-rw-r--r-- | source/scale_neon64.cc (renamed from files/source/scale_neon64.cc) | 134 | ||||
-rw-r--r-- | source/scale_rgb.cc (renamed from files/source/scale_rgb.cc) | 0 | ||||
-rw-r--r-- | source/scale_rvv.cc | 1040 | ||||
-rw-r--r-- | source/scale_uv.cc (renamed from files/source/scale_uv.cc) | 391 | ||||
-rw-r--r-- | source/scale_win.cc (renamed from files/source/scale_win.cc) | 0 | ||||
-rwxr-xr-x | source/test.sh (renamed from files/source/test.sh) | 0 | ||||
-rw-r--r-- | source/video_common.cc (renamed from files/source/video_common.cc) | 0 | ||||
-rw-r--r-- | tools_libyuv/OWNERS | 4 | ||||
-rwxr-xr-x | tools_libyuv/autoroller/roll_deps.py | 822 | ||||
-rwxr-xr-x | tools_libyuv/autoroller/unittests/roll_deps_test.py (renamed from files/tools_libyuv/autoroller/unittests/roll_deps_test.py) | 0 | ||||
-rw-r--r-- | tools_libyuv/autoroller/unittests/testdata/DEPS (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS) | 0 | ||||
-rw-r--r-- | tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new) | 0 | ||||
-rw-r--r-- | tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old) | 0 | ||||
-rwxr-xr-x | tools_libyuv/get_landmines.py (renamed from files/tools_libyuv/get_landmines.py) | 0 | ||||
-rw-r--r-- | tools_libyuv/msan/OWNERS | 3 | ||||
-rw-r--r-- | tools_libyuv/msan/blacklist.txt (renamed from files/tools_libyuv/msan/blacklist.txt) | 0 | ||||
-rw-r--r-- | tools_libyuv/ubsan/OWNERS | 3 | ||||
-rw-r--r-- | tools_libyuv/ubsan/blacklist.txt (renamed from files/tools_libyuv/ubsan/blacklist.txt) | 0 | ||||
-rw-r--r-- | tools_libyuv/ubsan/vptr_blacklist.txt (renamed from files/tools_libyuv/ubsan/vptr_blacklist.txt) | 0 | ||||
-rw-r--r-- | unit_test/basictypes_test.cc (renamed from files/unit_test/basictypes_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/color_test.cc (renamed from files/unit_test/color_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/compare_test.cc (renamed from files/unit_test/compare_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/convert_argb_test.cc (renamed from files/unit_test/convert_test.cc) | 2625 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 2110 | ||||
-rw-r--r-- | unit_test/cpu_test.cc (renamed from files/unit_test/cpu_test.cc) | 157 | ||||
-rw-r--r-- | unit_test/cpu_thread_test.cc (renamed from files/unit_test/cpu_thread_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/math_test.cc (renamed from files/unit_test/math_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/planar_test.cc (renamed from files/unit_test/planar_test.cc) | 305 | ||||
-rw-r--r-- | unit_test/rotate_argb_test.cc (renamed from files/unit_test/rotate_argb_test.cc) | 106 | ||||
-rw-r--r-- | unit_test/rotate_test.cc (renamed from files/unit_test/rotate_test.cc) | 363 | ||||
-rw-r--r-- | unit_test/scale_argb_test.cc (renamed from files/unit_test/scale_argb_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/scale_plane_test.cc | 470 | ||||
-rw-r--r-- | unit_test/scale_rgb_test.cc (renamed from files/unit_test/scale_rgb_test.cc) | 0 | ||||
-rw-r--r-- | unit_test/scale_test.cc (renamed from files/unit_test/scale_test.cc) | 478 | ||||
-rw-r--r-- | unit_test/scale_uv_test.cc (renamed from files/unit_test/scale_uv_test.cc) | 79 | ||||
-rw-r--r-- | unit_test/testdata/arm_v7.txt (renamed from files/unit_test/testdata/arm_v7.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/juno.txt (renamed from files/unit_test/testdata/juno.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/mips.txt (renamed from files/unit_test/testdata/mips.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/mips_loongson2k.txt (renamed from files/unit_test/testdata/mips_loongson2k.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/mips_loongson3.txt (renamed from files/unit_test/testdata/mips_loongson3.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/mips_loongson_mmi.txt (renamed from files/unit_test/testdata/mips_loongson_mmi.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/mips_msa.txt (renamed from files/unit_test/testdata/mips_msa.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/riscv64.txt | 4 | ||||
-rw-r--r-- | unit_test/testdata/riscv64_rvv.txt | 4 | ||||
-rw-r--r-- | unit_test/testdata/riscv64_rvv_zvfh.txt | 4 | ||||
-rw-r--r-- | unit_test/testdata/tegra3.txt (renamed from files/unit_test/testdata/tegra3.txt) | 0 | ||||
-rw-r--r-- | unit_test/testdata/test0.jpg (renamed from files/unit_test/testdata/test0.jpg) | bin | 421 -> 421 bytes | |||
-rw-r--r-- | unit_test/testdata/test1.jpg (renamed from files/unit_test/testdata/test1.jpg) | bin | 735 -> 735 bytes | |||
-rw-r--r-- | unit_test/testdata/test2.jpg (renamed from files/unit_test/testdata/test2.jpg) | bin | 685 -> 685 bytes | |||
-rw-r--r-- | unit_test/testdata/test3.jpg (renamed from files/unit_test/testdata/test3.jpg) | bin | 704 -> 704 bytes | |||
-rw-r--r-- | unit_test/testdata/test4.jpg (renamed from files/unit_test/testdata/test4.jpg) | bin | 701 -> 701 bytes | |||
-rw-r--r-- | unit_test/unit_test.cc (renamed from files/unit_test/unit_test.cc) | 16 | ||||
-rw-r--r-- | unit_test/unit_test.h (renamed from files/unit_test/unit_test.h) | 15 | ||||
-rw-r--r-- | unit_test/video_common_test.cc (renamed from files/unit_test/video_common_test.cc) | 0 | ||||
-rw-r--r-- | util/Makefile (renamed from files/util/Makefile) | 0 | ||||
-rw-r--r-- | util/color.cc (renamed from files/util/color.cc) | 0 | ||||
-rw-r--r-- | util/compare.cc (renamed from files/util/compare.cc) | 0 | ||||
-rw-r--r-- | util/cpuid.c (renamed from files/util/cpuid.c) | 66 | ||||
-rw-r--r-- | util/i444tonv12_eg.cc (renamed from files/util/i444tonv12_eg.cc) | 0 | ||||
-rw-r--r-- | util/psnr.cc (renamed from files/util/psnr.cc) | 0 | ||||
-rw-r--r-- | util/psnr.h (renamed from files/util/psnr.h) | 0 | ||||
-rw-r--r-- | util/psnr_main.cc (renamed from files/util/psnr_main.cc) | 0 | ||||
-rw-r--r-- | util/ssim.cc (renamed from files/util/ssim.cc) | 0 | ||||
-rw-r--r-- | util/ssim.h (renamed from files/util/ssim.h) | 0 | ||||
-rw-r--r-- | util/yuvconstants.c (renamed from files/util/yuvconstants.c) | 11 | ||||
-rw-r--r-- | util/yuvconvert.cc (renamed from files/util/yuvconvert.cc) | 10 | ||||
-rw-r--r-- | winarm.mk (renamed from files/winarm.mk) | 0 |
203 files changed, 20413 insertions, 16890 deletions
diff --git a/files/.clang-format b/.clang-format index 59d48705..59d48705 100644 --- a/files/.clang-format +++ b/.clang-format diff --git a/files/.gitignore b/.gitignore index 20d679b7..20d679b7 100644 --- a/files/.gitignore +++ b/.gitignore @@ -34,7 +34,5 @@ exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + default_args = { mac_sdk_min = "10.12" - - # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 - ios_deployment_target = "10.0" + ios_deployment_target = "12.0" } diff --git a/files/.vpython b/.vpython index 4a64fd21..4a64fd21 100644 --- a/files/.vpython +++ b/.vpython diff --git a/files/.vpython3 b/.vpython3 index 0a9aa38b..28d819e7 100644 --- a/files/.vpython3 +++ b/.vpython3 @@ -76,8 +76,8 @@ wheel: < version: "version:5.8.0.chromium.2" > wheel: < - name: "infra/python/wheels/requests-py2_py3" - version: "version:2.26.0" + name: "infra/python/wheels/requests-py3" + version: "version:2.31.0" > # Used by various python unit tests. @@ -1,7 +1,6 @@ package { default_applicable_licenses: ["external_libyuv_license"], } - // Added automatically by a large-scale-change // See: http://go/android-license-faq license { @@ -12,7 +11,183 @@ license { ], license_text: [ "LICENSE", + "PATENTS", ], } - subdirs = ["files"] + +cc_library { + name: "libyuv", + vendor_available: true, + product_available: true, + host_supported: true, + + srcs: [ + "source/compare.cc", + "source/compare_common.cc", + "source/compare_gcc.cc", + "source/compare_msa.cc", + "source/compare_neon.cc", + "source/compare_neon64.cc", + "source/convert.cc", + "source/convert_argb.cc", + "source/convert_from.cc", + "source/convert_from_argb.cc", + "source/convert_jpeg.cc", + "source/convert_to_argb.cc", + "source/convert_to_i420.cc", + "source/cpu_id.cc", + "source/mjpeg_decoder.cc", + "source/mjpeg_validate.cc", + "source/planar_functions.cc", + "source/rotate.cc", + "source/rotate_any.cc", + "source/rotate_argb.cc", + "source/rotate_common.cc", + "source/rotate_gcc.cc", + 
"source/rotate_msa.cc", + "source/rotate_neon.cc", + "source/rotate_neon64.cc", + "source/row_any.cc", + "source/row_common.cc", + "source/row_gcc.cc", + "source/row_msa.cc", + "source/row_neon.cc", + "source/row_neon64.cc", + "source/row_rvv.cc", + "source/scale.cc", + "source/scale_any.cc", + "source/scale_argb.cc", + "source/scale_common.cc", + "source/scale_gcc.cc", + "source/scale_msa.cc", + "source/scale_neon.cc", + "source/scale_neon64.cc", + "source/scale_rgb.cc", + "source/scale_rvv.cc", + "source/scale_uv.cc", + "source/video_common.cc", + ], + + cflags: [ + "-Wall", + "-Werror", + "-Wno-unused-parameter", + "-fexceptions", + "-DHAVE_JPEG", + "-DLIBYUV_UNLIMITED_DATA", + ], + + arch: { + arm: { + cflags: ["-mfpu=neon"], + }, + }, + + shared_libs: ["libjpeg"], + + export_include_dirs: ["include"], + + apex_available: [ + "//apex_available:platform", + "com.android.media.swcodec", + "com.android.virt", + ], + min_sdk_version: "29", +} + +// compatibilty static library until all uses of libyuv_static are replaced +// with libyuv (b/37646797) +cc_library_static { + name: "libyuv_static", + vendor_available: true, + whole_static_libs: ["libyuv"], + apex_available: [ + "//apex_available:platform", + "com.android.media.swcodec", + ], + min_sdk_version: "29", +} + +cc_test { + name: "libyuv_unittest", + static_libs: ["libyuv"], + shared_libs: ["libjpeg"], + cflags: ["-Wall", "-Werror"], + srcs: [ + "unit_test/basictypes_test.cc", + "unit_test/color_test.cc", + "unit_test/compare_test.cc", + "unit_test/convert_test.cc", + "unit_test/cpu_test.cc", + "unit_test/cpu_thread_test.cc", + "unit_test/math_test.cc", + "unit_test/planar_test.cc", + "unit_test/rotate_argb_test.cc", + "unit_test/rotate_test.cc", + "unit_test/scale_argb_test.cc", + "unit_test/scale_plane_test.cc", + "unit_test/scale_rgb_test.cc", + "unit_test/scale_test.cc", + "unit_test/scale_uv_test.cc", + "unit_test/unit_test.cc", + "unit_test/video_common_test.cc", + ], +} + +cc_test { + name: "compare", + 
gtest: false, + srcs: [ + "util/compare.cc", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "cpuid", + gtest: false, + srcs: [ + "util/cpuid.c", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "i444tonv12_eg", + gtest: false, + srcs: [ + "util/i444tonv12_eg.cc", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "psnr", + gtest: false, + srcs: [ + "util/psnr_main.cc", + "util/psnr.cc", + "util/ssim.cc", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "yuvconstants", + gtest: false, + srcs: [ + "util/yuvconstants.c", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "yuvconvert", + gtest: false, + srcs: [ + "util/yuvconvert.cc", + ], + static_libs: ["libyuv"], + shared_libs: ["libjpeg"], +} @@ -1,14 +0,0 @@ -# Copyright 2011 Google Inc. All Rights Reserved. -# -# Description: -# The libyuv package provides implementation yuv image conversion and -# scaling. -# -# This library is used by Talk Video and WebRTC. -# - -licenses(['notice']) # 3-clause BSD - -exports_files(['LICENSE']) - -package(default_visibility = ['//visibility:public']) diff --git a/files/BUILD.gn b/BUILD.gn index a72ff065..2c600b22 100644 --- a/files/BUILD.gn +++ b/BUILD.gn @@ -6,6 +6,7 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. 
+import("//build/config/features.gni") import("//testing/test.gni") import("libyuv.gni") @@ -21,15 +22,25 @@ declare_args() { config("libyuv_config") { include_dirs = [ "include" ] - if (is_android && current_cpu == "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] - } - if (is_android && current_cpu != "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + if (is_android) { + if (target_cpu == "arm" || target_cpu == "x86" || target_cpu == "mipsel") { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + } else { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] + } } - + defines = [] if (!libyuv_use_neon) { - defines = [ "LIBYUV_DISABLE_NEON" ] + defines += [ "LIBYUV_DISABLE_NEON" ] + } + if (libyuv_disable_rvv) { + defines += [ "LIBYUV_DISABLE_RVV" ] + } + if (!libyuv_use_lsx) { + defines += [ "LIBYUV_DISABLE_LSX" ] + } + if (!libyuv_use_lasx) { + defines += [ "LIBYUV_DISABLE_LASX" ] } } @@ -69,6 +80,14 @@ group("libyuv") { deps += [ ":libyuv_msa" ] } + if (libyuv_use_lsx) { + deps += [ ":libyuv_lsx" ] + } + + if (libyuv_use_lasx) { + deps += [ ":libyuv_lasx" ] + } + if (!is_ios && !libyuv_disable_jpeg) { # Make sure that clients of libyuv link with libjpeg. 
This can't go in # libyuv_internal because in Windows x64 builds that will generate a clang @@ -129,6 +148,7 @@ static_library("libyuv_internal") { "source/row_any.cc", "source/row_common.cc", "source/row_gcc.cc", + "source/row_rvv.cc", "source/row_win.cc", "source/scale.cc", "source/scale_any.cc", @@ -136,6 +156,7 @@ static_library("libyuv_internal") { "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_rgb.cc", + "source/scale_rvv.cc", "source/scale_uv.cc", "source/scale_win.cc", "source/video_common.cc", @@ -150,7 +171,7 @@ static_library("libyuv_internal") { configs += [ "//build/config/gcc:symbol_visibility_default" ] } - if (!is_ios && !libyuv_disable_jpeg) { + if ((!is_ios || use_blink) && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps @@ -229,6 +250,44 @@ if (libyuv_use_msa) { } } +if (libyuv_use_lsx) { + static_library("libyuv_lsx") { + sources = [ + # LSX Source Files + "source/rotate_lsx.cc", + "source/row_lsx.cc", + "source/scale_lsx.cc", + ] + + cflags_cc = [ + "-mlsx", + "-Wno-c++11-narrowing", + ] + + deps = [ ":libyuv_internal" ] + + public_configs = [ ":libyuv_config" ] + } +} + +if (libyuv_use_lasx) { + static_library("libyuv_lasx") { + sources = [ + # LASX Source Files + "source/row_lasx.cc", + ] + + cflags_cc = [ + "-mlasx", + "-Wno-c++11-narrowing", + ] + + deps = [ ":libyuv_internal" ] + + public_configs = [ ":libyuv_config" ] + } +} + if (libyuv_include_tests) { config("libyuv_unittest_warnings_config") { if (!is_win) { @@ -256,6 +315,7 @@ if (libyuv_include_tests) { "unit_test/basictypes_test.cc", "unit_test/color_test.cc", "unit_test/compare_test.cc", + "unit_test/convert_argb_test.cc", "unit_test/convert_test.cc", "unit_test/cpu_test.cc", "unit_test/cpu_thread_test.cc", @@ -264,6 +324,7 @@ if (libyuv_include_tests) { "unit_test/rotate_argb_test.cc", "unit_test/rotate_test.cc", "unit_test/scale_argb_test.cc", + "unit_test/scale_plane_test.cc", 
"unit_test/scale_rgb_test.cc", "unit_test/scale_test.cc", "unit_test/scale_uv_test.cc", diff --git a/files/CM_linux_packages.cmake b/CM_linux_packages.cmake index 5f676f89..a073edfa 100644 --- a/files/CM_linux_packages.cmake +++ b/CM_linux_packages.cmake @@ -8,7 +8,7 @@ SET ( YUV_VER_MAJOR 0 ) SET ( YUV_VER_MINOR 0 ) SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) -MESSAGE ( "Building ver.: ${YUV_VERSION}" ) +MESSAGE ( VERBOSE "Building ver.: ${YUV_VERSION}" ) # is this a 32-bit or 64-bit build? IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) @@ -45,7 +45,7 @@ ELSE () SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) ENDIF () ENDIF () -MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) +MESSAGE ( VERBOSE "Packaging for: ${YUV_SYSTEM_NAME}" ) # define all the variables needed by CPack to create .deb and .rpm packages SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) diff --git a/files/CMakeLists.txt b/CMakeLists.txt index d190507b..9abfa74b 100644 --- a/files/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ PROJECT ( YUV C CXX ) # "C" is required even for C++ projects CMAKE_MINIMUM_REQUIRED( VERSION 2.8.12 ) -OPTION( TEST "Built unit tests" OFF ) +OPTION( UNIT_TEST "Built unit tests" OFF ) SET ( ly_base_dir ${PROJECT_SOURCE_DIR} ) SET ( ly_src_dir ${ly_base_dir}/source ) @@ -37,22 +37,32 @@ if(WIN32) SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES IMPORT_PREFIX "lib" ) endif() +# this creates the cpuid tool +ADD_EXECUTABLE ( cpuid ${ly_base_dir}/util/cpuid.c ) +TARGET_LINK_LIBRARIES ( cpuid ${ly_lib_static} ) + # this creates the conversion tool ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc ) TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} ) +# this creates the yuvconstants tool +ADD_EXECUTABLE ( yuvconstants ${ly_base_dir}/util/yuvconstants.c ) +TARGET_LINK_LIBRARIES ( yuvconstants ${ly_lib_static} ) -INCLUDE ( FindJPEG ) +find_package ( JPEG ) if (JPEG_FOUND) include_directories( 
${JPEG_INCLUDE_DIR} ) - target_link_libraries( yuvconvert ${JPEG_LIBRARY} ) + target_link_libraries( ${ly_lib_shared} ${JPEG_LIBRARY} ) add_definitions( -DHAVE_JPEG ) endif() -if(TEST) +if(UNIT_TEST) find_library(GTEST_LIBRARY gtest) if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") + if (CMAKE_CROSSCOMPILING) + set(GTEST_SRC_DIR third_party/googletest/src/googletest) + endif() if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) @@ -61,7 +71,7 @@ if(TEST) include_directories(${GTEST_SRC_DIR}/include) set(GTEST_LIBRARY gtest) else() - message(FATAL_ERROR "TEST is set but unable to find gtest library") + message(FATAL_ERROR "UNIT_TEST is set but unable to find gtest library") endif() endif() @@ -78,6 +88,12 @@ if(TEST) if(NACL AND NACL_LIBC STREQUAL "newlib") target_link_libraries(libyuv_unittest glibc-compat) endif() + + find_library(GFLAGS_LIBRARY gflags) + if(NOT GFLAGS_LIBRARY STREQUAL "GFLAGS_LIBRARY-NOTFOUND") + target_link_libraries(libyuv_unittest gflags) + add_definitions(-DLIBYUV_USE_GFLAGS) + endif() endif() @@ -5,43 +5,62 @@ gclient_gn_args = [ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': '829c6df33dce1085a61d8fd44209fc84bbf9a6a7', - 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94', + 'chromium_revision': 'af3d01376bec75a68f90160bfd38057d60510a2b', + 'gn_version': 'git_revision:fae280eabe5d31accc53100137459ece19a7a295', + # ninja CIPD package version. + # https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja + 'ninja_version': 'version:2@1.11.1.chromium.6', + # reclient CIPD package version + 'reclient_version': 're_client_version:0.110.0.43ec6b1-gomaip', # Keep the Chromium default of generating location tags. 
'generate_location_tags': True, + + # By default, download the fuchsia sdk from the public sdk directory. + 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/core/', + 'fuchsia_version': 'version:15.20230909.2.1', + # By default, download the fuchsia images from the fuchsia GCS bucket. + 'fuchsia_images_bucket': 'fuchsia', + 'checkout_fuchsia': False, + # Since the images are hundreds of MB, default to only downloading the image + # most commonly useful for developers. Bots and developers that need to use + # other images can override this with additional images. + 'checkout_fuchsia_boot_images': "terminal.qemu-x64,terminal.x64", + 'checkout_fuchsia_product_bundles': '"{checkout_fuchsia_boot_images}" != ""', } deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + 'dcea3443035f48d58193788e0bc56daca4e5db33', + Var('chromium_git') + '/chromium/src/build' + '@' + '5885d3c24833ad72845a52a1b913a2b8bc651b56', 'src/buildtools': - Var('chromium_git') + '/chromium/src/buildtools' + '@' + '075dd7e22837a69189003e4fa84499acf63188cf', + Var('chromium_git') + '/chromium/src/buildtools' + '@' + '79ab87fa54614258c4c95891e873223371194525', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + 'f4e42be13265ec304b0f3085eee2b15f30f44077', + Var('chromium_git') + '/chromium/src/testing' + '@' + '51e9a02297057cc0e917763a51e16680b7d16fb6', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + '42c249feeb71bc0cd184849f0509aefef599343d', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '2dc4b18abd1003ce7b1eda509dc96f12d49a9667', 'src/buildtools/linux64': { 'packages': [ { - 'package': 'gn/gn/linux-amd64', + 'package': 'gn/gn/linux-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', - 'condition': 'checkout_linux', + 'condition': 'host_os == "linux"', }, + 'src/buildtools/mac': { 'packages': [ { - 'package': 'gn/gn/mac-amd64', + 'package': 'gn/gn/mac-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 
'cipd', - 'condition': 'checkout_mac', + 'condition': 'host_os == "mac"', }, + 'src/buildtools/win': { 'packages': [ { @@ -50,43 +69,60 @@ deps = { } ], 'dep_type': 'cipd', - 'condition': 'checkout_win', + 'condition': 'host_os == "win"', }, - 'src/buildtools/clang_format/script': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + '99876cacf78329e5f99c244dbe42ccd1654517a0', - 'src/buildtools/third_party/libc++/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '79a2e924d96e2fc1e4b937c42efd08898fa472d7', - 'src/buildtools/third_party/libc++abi/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '665b74f7d1b3bb295cd6ba7d8fcec1acd3d2ac84', - 'src/buildtools/third_party/libunwind/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f51a154281bdfe746c46c07cd4fb05be97f9441d', + 'src/buildtools/reclient': { + 'packages': [ + { + 'package': 'infra/rbe/client/${{platform}}', + 'version': Var('reclient_version'), + } + ], + 'dep_type': 'cipd', + }, 'src/third_party/catapult': - Var('chromium_git') + '/catapult.git' + '@' + '75423c310eb303d28978be892fcf7b9c2c824909', + Var('chromium_git') + '/catapult.git' + '@' + 'fa05d995e152efdae488a2aeba397cd609fdbc9d', + 'src/third_party/clang-format/script': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef', 'src/third_party/colorama/src': - Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', + Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49', + 'src/third_party/cpu_features/src': { + 'url': Var('chromium_git') + '/external/github.com/google/cpu_features.git' + '@' + '936b9ab5515dead115606559502e3864958f7f6e', + 'condition': 'checkout_android', + }, 
'src/third_party/depot_tools': - Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '2ffa1bde797a8127c0f72908d0bd74051fd65d0d', + Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'd3e43dd4319ba169c0aaf44547eecf861f2fe5da', 'src/third_party/freetype/src': - Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'cff026d41599945498044d2f4dcc0e610ffb6929', + Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '9e3c5d7e183c1a8d5ed8868d7d28ef18d3ec9ec8', + 'third_party/fuchsia-gn-sdk': { + 'url': Var('chromium_git') + '/chromium/src/third_party/fuchsia-gn-sdk.git' + '@' + '0d6902558d92fe3d49ba9a8f638ddea829be595b', + 'condition': 'checkout_fuchsia', + }, 'src/third_party/googletest/src': - Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'e2f3978937c0244508135f126e2617a7734a68be', + Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07', 'src/third_party/harfbuzz-ng/src': - Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '64b29dbd5994a511acee69cb9b45ad650ef88359', + Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'db700b5670d9475cc8ed4880cc9447b232c5e432', + 'src/third_party/libc++/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '84fb809dd6dae36d556dc0bb702c6cc2ce9d4b80', + 'src/third_party/libc++abi/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '8d21803b9076b16d46c32e2f10da191ee758520c', + 'src/third_party/libunwind/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f1c687e0aaf0d70b9a53a150e9be5cb63af9215f', 'src/third_party/libjpeg_turbo': - Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '02959c3ee17abacfd1339ec22ea93301292ffd56', + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 
'30bdb85e302ecfc52593636b2f44af438e05e784', 'src/third_party/nasm': - Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '9215e8e1d0fe474ffd3e16c1a07a0f97089e6224', + Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '7fc833e889d1afda72c06220e5bed8fb43b2e5ce', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '198dc879529652b39ba6e223bcc0bcad5f1facd6', + Var('chromium_git') + '/chromium/src/tools' + '@' + 'a76c0dbb64c603a0d45e0c6dfae3a351b6e1adf1', # libyuv-only dependencies (not present in Chromium). 'src/third_party/gtest-parallel': Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e', 'src/third_party/lss': { - 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '92a65a8f5d705d1928874420c8d0d15bde8c89e5', + 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + 'ce877209e11aa69dcfffbd53ef90ea1d07136521', 'condition': 'checkout_android or checkout_linux', }, @@ -101,14 +137,32 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/auto/src': { - 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'fe67d853d6356943dc79541c892ab6d3e6a7b61a', - 'condition': 'checkout_android', + + 'src/third_party/kotlin_stdlib': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlin_stdlib', + 'version': 'Z1gsqhL967kFQecxKrRwXHbl-vwQjpv0l7PMUZ0EVO8C', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/kotlinc/current': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlinc', + 'version': 'Rr02Gf2EkaeSs3EhSUHhPqDHSd1AzimrM6cRYUJCPjQC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', }, + 'src/third_party/boringssl/src': - 'https://boringssl.googlesource.com/boringssl.git' + '@' + '3a667d10e94186fd503966f5638e134fe9fb4080', + 'https://boringssl.googlesource.com/boringssl.git' + '@' + '20a06474c0b4a16779311bfe98ba69dc2402101d', 
'src/base': { - 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e9e639622449a893a1b5e32781d072cec08ead72', + 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'd407b7061bce341bb6e11b539ea86c46c949ac4c', 'condition': 'checkout_android', }, 'src/third_party/bazel': { @@ -131,20 +185,28 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_ndk': { - 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '401019bf85744311b26c88ced255cd53401af8b7', - 'condition': 'checkout_android', + 'src/third_party/android_toolchain': { + 'packages': [ + { + 'package': 'chromium/third_party/android_toolchain/android_toolchain', + 'version': 'R_8suM8m0oHbZ1awdxGXvKEFpAOETscbfZxkkMthyk8C', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', }, + 'src/third_party/androidx': { 'packages': [ { 'package': 'chromium/third_party/androidx', - 'version': '6d8ij5pzYh29WWjPbdbAWFBJSA1nUgkWf2p6wCVZKIsC', + 'version': 'y7rF_rx56mD3FGhMiqnlbQ6HOqHJ95xUFNX1m-_a988C', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_support_test_runner': { 'packages': [ { @@ -158,16 +220,12 @@ deps = { 'src/third_party/android_sdk/public': { 'packages': [ { - 'package': 'chromium/third_party/android_sdk/public/build-tools/31.0.0', - 'version': 'tRoD45SCi7UleQqSV7MrMQO1_e5P8ysphkCcj6z_cCQC', + 'package': 'chromium/third_party/android_sdk/public/build-tools/34.0.0', + 'version': 'YK9Rzw3fDzMHVzatNN6VlyoD_81amLZpN1AbmkdOd6AC', }, { 'package': 'chromium/third_party/android_sdk/public/emulator', - 'version': 'gMHhUuoQRKfxr-MBn3fNNXZtkAVXtOwMwT7kfx8jkIgC', - }, - { - 'package': 'chromium/third_party/android_sdk/public/extras', - 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC', + 'version': '9lGp8nTUCRRWGMnI_96HcKfzjnxEJKUcfvfwmA3wXNkC', }, { 'package': 'chromium/third_party/android_sdk/public/patcher', @@ -175,11 +233,15 @@ deps = { }, { 'package': 
'chromium/third_party/android_sdk/public/platform-tools', - 'version': 'g7n_-r6yJd_SGRklujGB1wEt8iyr77FZTUJVS9w6O34C', + 'version': 'HWVsGs2HCKgSVv41FsOcsfJbNcB0UFiNrF6Tc4yRArYC', }, { - 'package': 'chromium/third_party/android_sdk/public/platforms/android-31', - 'version': 'lL3IGexKjYlwjO_1Ga-xwxgwbE_w-lmi2Zi1uOlWUIAC', + 'package': 'chromium/third_party/android_sdk/public/platforms/android-34', + 'version': 'u-bhWbTME6u-DjypTgr3ZikCyeAeU6txkR9ET6Uudc8C', + }, + { + 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox', + 'version': 'YWMYkzyxGBgVsty0GhXL1oxbY0pGXQIgFc0Rh7ZMRPYC', }, { 'package': 'chromium/third_party/android_sdk/public/sources/android-31', @@ -187,7 +249,7 @@ deps = { }, { 'package': 'chromium/third_party/android_sdk/public/cmdline-tools', - 'version': 'Ez2NWws2SJYCF6qw2O-mSCqK6424l3ZdSTpppLyVR_cC', + 'version': 'EWnL2r7oV5GtE9Ef7GyohyFam42wtMtEKYU4dCb3U1YC', }, ], 'condition': 'checkout_android', @@ -207,7 +269,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_build_tools/aapt2', - 'version': 'version:3.6.0-alpha03-5516695-cr0', + 'version': 'STY0BXlZxsEhudnlXQFed-B5UpwehcoM0sYqor6qRqsC', }, ], 'condition': 'checkout_android', @@ -223,6 +285,16 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/byte_buddy/android_sdk_build_tools_25_0_2': { + 'packages': [ + { + 'package': 'chromium/third_party/android_sdk/public/build-tools', + 'version': 'kwIs2vdfTm93yEP8LG5aSnchN4BVEdVxbqQtF4XpPdkC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, 'src/third_party/ced/src': { 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5', 'condition': 'checkout_android', @@ -267,7 +339,7 @@ deps = { }, 'src/third_party/icu': { - 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'bf66d373ae781a3498f2babe7b61d933dd774b82', + 'url': Var('chromium_git') + 
'/chromium/deps/icu.git' + '@' + 'e8c3bc9ea97d4423ad0515e5f1c064f486dae8b1', }, 'src/third_party/icu4j': { 'packages': [ @@ -293,11 +365,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/jdk', - 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC', - }, - { - 'package': 'chromium/third_party/jdk/extras', - 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C', + 'version': 'GCFtf5t6M4HlrHj6NXedHbpHp2xjgognF8ptNci4478C', }, ], 'condition': 'checkout_android', @@ -308,22 +376,31 @@ deps = { 'condition': 'checkout_android', }, 'src/third_party/junit/src': { - 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481', + 'url': Var('chromium_git') + '/external/junit.git' + '@' + '05fe2a64f59127c02135be22f416e91260d6ede6', 'condition': 'checkout_android', }, 'src/third_party/libunwindstack': { - 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '6868358481bb1e5e20d155c1084dc436c88b5e6b', + 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '4dbfa0e8c844c8e243b297bc185e54a99ff94f9e', 'condition': 'checkout_android', }, + 'src/third_party/ninja': { + 'packages': [ + { + 'package': 'infra/3pp/tools/ninja/${{platform}}', + 'version': Var('ninja_version'), + } + ], + 'dep_type': 'cipd', + }, 'src/third_party/mockito/src': { - 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac', + 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '7c3641bcef717ffa7d765f2c86b847d0aab1aac9', 'condition': 'checkout_android', }, 'src/third_party/objenesis': { 'packages': [ { 'package': 'chromium/third_party/objenesis', - 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0', + 'version': 'tknDblENYi8IaJYyD6tUahUyHYZlzJ_Y74_QZSz4DpIC', }, ], 'condition': 'checkout_android', @@ -343,7 +420,20 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': 
'Nu_mvQJe34CotIXadFlA3w732CJ9EvQGuVs4udcZedAC', + 'version': 'O1BBWiBTIeNUcraX8STMtQXVaCleu6SJJjWCcnfhPLkC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + # This duplication is intentional, so we avoid updating the r8.jar used by + # dexing unless necessary, since each update invalidates all incremental + # dexing and unnecessarily slows down all bots. + 'src/third_party/r8/d8': { + 'packages': [ + { + 'package': 'chromium/third_party/r8', + 'version': 'vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC', }, ], 'condition': 'checkout_android', @@ -360,14 +450,14 @@ deps = { 'dep_type': 'cipd', }, 'src/third_party/requests/src': { - 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0', + 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'c7e0fc087ceeadb8b4c84a0953a422c474093d6d', 'condition': 'checkout_android', }, 'src/third_party/robolectric': { 'packages': [ { 'package': 'chromium/third_party/robolectric', - 'version': 'iC6RDM5EH3GEAzR-1shW_Mg0FeeNE5shq1okkFfuuNQC', + 'version': 'hzetqh1qFI32FOgQroZvGcGdomrgVBJ6WKRnl1KFw6EC', }, ], 'condition': 'checkout_android', @@ -377,7 +467,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/sqlite4java', - 'version': '889660698187baa7c8b0d79f7bf58563125fbd66', + 'version': 'LofjKH9dgXIAJhRYCPQlMFywSwxYimrfDeBmaHc-Z5EC', }, ], 'condition': 'checkout_android', @@ -387,7 +477,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/turbine', - 'version': 'Om6yIEXgJxuqghErK29h9RcMH6VaymMbxwScwXmcN6EC', + 'version': '2I2Nz480QsuCxpQ1lMfbigX8l5HAhX3_ykWU4TKRGo4C', }, ], 'condition': 'checkout_android', @@ -400,1718 +490,1822 @@ deps = { # iOS deps: 'src/ios': { - 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '81826d980c159f949c2c7901f4dbec9a09788964', + 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + 'ddd58e86cf4ebdc0db60a5d0f3c323de49bb295c', 'condition': 'checkout_ios' }, # 
Everything coming after this is automatically updated by the auto-roller. # === ANDROID_DEPS Generated Code Start === - + # Generated by //third_party/android_deps/fetch_all.py 'src/third_party/android_deps/libs/android_arch_core_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_core_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 
'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel', - 'version': 'version:2@1.1.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent', - 'version': 'version:2@3.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/classworlds_classworlds': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds', - 'version': 'version:2@1.1-alpha-2.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cardview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_collections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cursoradapter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_customview': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_design': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_design', - 'version': 'version:2@28.0.0.cr0', + 'version': 
'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_documentfile': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_drawerlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_interpolator': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_loader': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_multidex': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex', - 'version': 'version:2@1.0.0.cr0', + 'version': 'version:2@1.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_android_support_print': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_print', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_ui': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_utils': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_fragment': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_media_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_v4': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_transition': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_transition', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_viewpager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_tools_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': { + + 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs', - 'version': 'version:2@1.1.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': { + + 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration', - 'version': 'version:2@1.1.1.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { + + 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', + 'version': 'version:2@2.8.8.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { + + 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { + + 'src/third_party/android_deps/libs/com_google_android_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', - 'version': 'version:2@2.8.8.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_annotations', + 'version': 'version:2@4.1.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { + + 'src/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework': { 'packages': [ { - 'package': 
'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', - 'version': 'version:2@1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework', + 'version': 'version:2@4.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api', - 'version': 'version:2@2.2.1.cr0', + 'version': 'version:2@2.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@20.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging', - 'version': 'version:2@16.0.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido', - 'version': 'version:2@19.0.0-beta.cr0', + 'version': 
'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@19.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype', - 'version': 'version:2@17.0.0.cr0', + 
'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks', - 'version': 'version:2@17.2.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@20.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@19.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_material_material': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material', - 'version': 
'version:2@1.6.0-alpha01.cr0', + 'version': 'version:2@1.7.0-alpha02.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_google_android_play_core': { + + 'src/third_party/android_deps/libs/com_google_android_play_core_common': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core', - 'version': 'version:2@1.10.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core_common', + 'version': 'version:2@2.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + + 'src/third_party/android_deps/libs/com_google_android_play_feature_delivery': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_feature_delivery', + 'version': 'version:2@2.0.1.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/com_google_auto_auto_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common', - 'version': 'version:2@1.1.2.cr0', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations', - 'version': 'version:2@1.7.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_code_findbugs_jformatstring': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring', - 'version': 'version:2@3.0.0.cr0', + 'version': 'version:2@1.10.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305', - 'version': 'version:2@3.0.2.cr0', + 'version': 'version:2@3.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_gson_gson': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson', - 'version': 'version:2@2.8.0.cr0', + 'version': 'version:2@2.9.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.18.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac', - 'version': 'version:2@9+181-r4173-1.cr0', + 'version': 'version:2@9+181-r4173-1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded', - 'version': 'version:2@9-dev-r4023-3.cr0', + 'version': 'version:2@9-dev-r4023-3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations', - 'version': 'version:2@16.0.0.cr0', + 'version': 'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common', - 'version': 'version:2@19.5.0.cr0', + 'version': 'version:2@19.5.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_components': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json', - 'version': 'version:2@17.1.0.cr0', + 'version': 'version:2@17.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid', - 'version': 'version:2@21.0.1.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations', - 'version': 'version:2@16.3.5.cr0', + 'version': 'version:2@16.3.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop', - 'version': 'version:2@16.0.1.cr0', + 'version': 'version:2@16.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@18.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging', - 'version': 'version:2@21.0.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java', - 'version': 'version:2@2.0.3.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format', - 'version': 'version:2@1.5.cr0', + 'version': 'version:2@1.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_failureaccess': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess', - 'version': 'version:2@1.0.1.cr0', + 'version': 'version:2@1.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', - 'version': 'version:2@31.0-jre.cr0', + 'version': 'version:2@31.1-jre.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava_android': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android', - 'version': 'version:2@31.0-android.cr0', + 'version': 'version:2@31.1-android.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture', - 'version': 'version:2@1.0.cr0', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations', - 'version': 'version:2@1.3.cr0', + 'version': 'version:2@1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java', - 'version': 'version:2@3.4.0.cr0', + 'version': 'version:2@3.19.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite', - 'version': 'version:2@3.13.0.cr0', + 'version': 'version:2@3.21.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils', - 'version': 'version:2@1.3.0.cr0', + 'version': 'version:2@1.3.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javapoet': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet', - 'version': 'version:2@1.13.0.cr0', + 'version': 'version:2@1.13.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javawriter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter', - 'version': 'version:2@2.1.1.cr0', - }, - ], - 'condition': 
'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', - 'version': 'version:2@4.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', - 'version': 'version:2@1.3.2.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', - 'version': 'version:2@1.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_inject_javax_inject': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', - 'version': 'version:2@1.cr0', + 'version': 'version:2@2.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/nekohtml_nekohtml': { + + 'src/third_party/android_deps/libs/com_squareup_okio_okio_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm', + 'version': 'version:2@3.3.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/nekohtml_xercesminimal': { + + 'src/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm', + 'version': 'version:2@4.7.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { + + 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', - 'version': 'version:2@0.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', + 'version': 'version:2@4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2', - 'version': 'version:2@2.3.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_api', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_binder': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_binder', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_context': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_context', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 
'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks', - 'version': 'version:2@2.1.3.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_core', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_stub': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_stub', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': { + + 'src/third_party/android_deps/libs/io_perfmark_perfmark_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_perfmark_perfmark_api', + 'version': 'version:2@0.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_model': { + + 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { 
'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', + 'version': 'version:2@1.3.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': { + + 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': { + + 'src/third_party/android_deps/libs/javax_inject_javax_inject': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', + 'version': 'version:2@1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_project': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy', + 'version': 'version:2@1.14.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata', - 'version': 
'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent', + 'version': 'version:2@1.14.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': { + + 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', + 'version': 'version:2@0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': { + + 'src/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on', + 'version': 'version:2@1.72.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': { + + 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', + 'version': 'version:2@2.5.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_util': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', - 'version': 'version:2@1.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_util', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { + + 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', - 'version': 'version:2@2.5.5.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', + 'version': 'version:2@3.15.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { + + 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', - 'version': 'version:2@3.12.0.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', + 'version': 'version:2@1.21.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { + + 'src/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', - 'version': 'version:2@3.15.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber', + 'version': 'version:2@2.5.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { + + 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', - 'version': 'version:2@1.17.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', + 'version': 'version:2@4.4.1.201607150455-r.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': { + + 'src/third_party/android_deps/libs/org_hamcrest_hamcrest': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default', - 'version': 'version:2@1.0-alpha-9-stable-1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_hamcrest_hamcrest', + 'version': 'version:2@2.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation', - 'version': 
'version:2@1.11.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils', - 'version': 'version:2@1.5.15.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', - 'version': 'version:2@4.4.1.201607150455-r.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_annotations': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations', - 'version': 'version:2@13.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib', 
- 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common', - 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', + 'version': 'version:2@0.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { + + 'src/third_party/android_deps/libs/org_jsoup_jsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jsoup_jsoup', + 'version': 'version:2@1.15.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', - 'version': 
'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_subclass': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { + + 'src/third_party/android_deps/libs/org_objenesis_objenesis': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', - 'version': 'version:2@0.1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_objenesis_objenesis', + 'version': 'version:2@3.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_pcollections_pcollections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections', - 'version': 'version:2@2.1.2.cr0', + 'version': 'version:2@3.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_junit': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime', + 'version': 'version:2@4.10.3.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat', 
+ 'version': 'version:2@1.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_pluginapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_resources': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_robolectric': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_sandbox': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadowapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/org_robolectric_shadows_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', @@ -2197,30 +2391,75 @@ hooks = [ 'condition': 'checkout_mac', }, { - 'name': 'msan_chained_origins', + 'name': 'msan_chained_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_no_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 
'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_chained_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-chained-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', ], }, { - 'name': 'msan_no_origins', + 'name': 'msan_no_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-no-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', ], }, { + 'name': 'Download Fuchsia SDK from GCS', + 'pattern': '.', + 'condition': 'checkout_fuchsia', + 'action': [ + 'python3', + 'src/build/fuchsia/update_sdk.py', + '--cipd-prefix={fuchsia_sdk_cipd_prefix}', + '--version={fuchsia_version}', + ], + }, + { + 'name': 'Download Fuchsia system images', + 'pattern': '.', + 'condition': 'checkout_fuchsia and checkout_fuchsia_product_bundles', + 'action': [ + 'python3', + 'src/build/fuchsia/update_product_bundles.py', + '{checkout_fuchsia_boot_images}', + ], + }, + { # Pull clang if needed or requested via GYP_DEFINES. # Note: On Win, this should run after win_toolchain, as it may use it. 
'name': 'clang', @@ -2238,7 +2477,9 @@ hooks = [ { 'name': 'clang_format_win', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "win"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=win32', '--no_auth', @@ -2247,21 +2488,38 @@ hooks = [ ], }, { - 'name': 'clang_format_mac', + 'name': 'clang_format_mac_x64', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "mac" and host_cpu == "x64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=darwin', '--no_auth', '--bucket', 'chromium-clang-format', - '-s', 'src/buildtools/mac/clang-format.sha1', + '-s', 'src/buildtools/mac/clang-format.x64.sha1', + '-o', 'src/buildtools/mac/clang-format', ], }, { + 'name': 'clang_format_mac_arm64', + 'pattern': '.', + 'condition': 'host_os == "mac" and host_cpu == "arm64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', 'src/buildtools/mac/clang-format.arm64.sha1', + '-o', 'src/buildtools/mac/clang-format', + ], + }, + { 'name': 'clang_format_linux', 'pattern': '.', 'condition': 'host_os == "linux"', - 'action': [ 'download_from_google_storage', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=linux*', '--no_auth', @@ -2304,18 +2562,6 @@ hooks = [ ], }, { - # We used to use src as a CIPD root. We moved it to a different directory - # in crrev.com/c/930178 but left the clobber here to ensure that that CL - # could be reverted safely. This can be safely removed once crbug.com/794764 - # is resolved. 
- 'name': 'Android Clobber Deprecated CIPD Root', - 'pattern': '.', - 'condition': 'checkout_android', - 'action': ['src/build/cipd/clobber_cipd_root.py', - '--root', 'src', - ], - }, - { 'name': 'Generate component metadata for tests', 'pattern': '.', 'action': [ diff --git a/files/DIR_METADATA b/DIR_METADATA index 8bc04f15..8bc04f15 100644 --- a/files/DIR_METADATA +++ b/DIR_METADATA @@ -1,4 +1,4 @@ -Copyright (c) 2011, Google Inc. All rights reserved. +Copyright 2011 The LibYuv Project Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1,14 +1,19 @@ -name: "libyuv" -description: - "libyuv is an open source project that includes YUV scaling and conversion " - "functionality." +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update libyuv +# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md +name: "libyuv" +description: "libyuv is an open source project that includes YUV scaling and conversion functionality." 
third_party { - url { - type: GIT + license_type: NOTICE + last_upgrade_date { + year: 2024 + month: 1 + day: 11 + } + identifier { + type: "Git" value: "https://chromium.googlesource.com/libyuv/libyuv/" + version: "af6ac8265bbd07bcf977526458b60305c4304288" } - version: "d53f1beecdd8d959f7a3f2e19bd0bd7e7227a233" - last_upgrade_date { year: 2022 month: 8 day: 5 } - license_type: NOTICE } @@ -1,4 +1,11 @@ -fbarchard@google.com -phoglund@google.com -magjed@google.com -chz@google.com +mbonadei@chromium.org +fbarchard@chromium.org +magjed@chromium.org +wtc@google.com +jansson@google.com + +per-file *.gn=mbonadei@chromium.org,jansson@google.com +per-file .gitignore=* +per-file AUTHORS=* +per-file DEPS=* +per-file PRESUBMIT.py=mbonadei@chromium.org,jansson@google.com diff --git a/OWNERS.android b/OWNERS.android new file mode 100644 index 00000000..7529cb92 --- /dev/null +++ b/OWNERS.android @@ -0,0 +1 @@ +include platform/system/core:/janitors/OWNERS diff --git a/files/PRESUBMIT.py b/PRESUBMIT.py index d3901caf..d3901caf 100644 --- a/files/PRESUBMIT.py +++ b/PRESUBMIT.py diff --git a/files/README.chromium b/README.chromium index 3f68e21e..1389f285 100644 --- a/files/README.chromium +++ b/README.chromium @@ -1,8 +1,9 @@ Name: libyuv -URL: http://code.google.com/p/libyuv/ -Version: 1837 +URL: https://chromium.googlesource.com/libyuv/libyuv/ +Version: 1883 License: BSD License File: LICENSE +Shipped: yes Description: libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/files/README.md b/README.md index db70b7f0..95eeb04c 100644 --- a/files/README.md +++ b/README.md @@ -7,6 +7,7 @@ * Optimized for SSSE3/AVX2 on x86/x64. * Optimized for Neon on Arm. * Optimized for MSA on Mips. +* Optimized for RVV on RISC-V. 
### Development diff --git a/README.version b/README.version deleted file mode 100644 index 5deb188e..00000000 --- a/README.version +++ /dev/null @@ -1,8 +0,0 @@ -Version: r1837 -BugComponent: 42195 -Owner: lajos -Local Modifications: - * Remove files/Android.mk (it messes with the android build system). - * Remove OWNERS files within files/ and all the subdirectories (except for - files/fuzz). Having these files breaks repo presubmit hooks since they - contain non @google.com email addresses. diff --git a/UPDATING b/UPDATING deleted file mode 100644 index 2679284c..00000000 --- a/UPDATING +++ /dev/null @@ -1,36 +0,0 @@ -To sync the libyuv checkout to an upstream revision, do the following: - -These commands are known to work from the external/libyuv directory of the -Android tree's checkout. - -Step 1: Remove the files/ subdirectory. - -$ rm -rf files - -Step 2: Clone the libyuv repository from upstream. - -$ git clone https://chromium.googlesource.com/libyuv/libyuv files - -Step 3 (optional): Checkout a specific commit/tag. - -$ cd files -$ git checkout <commit_or_tag> -$ cd .. - -Step 4: Remove files that aren't necessary (Android.mk, .git and OWNERS). - -$ rm files/Android.mk -$ rm -rf files/.git -$ find files/ -name "OWNERS" | xargs rm - -Step 5: Update the version and last_upgrade_date fields in the METADATA file. - -Step 6: Update README.version with the version (can be found in - files/include/libyuv/version.h) - -Step 7: If any local modifications are being done, update README.version and - this file with updated instructions. - -Step 8: Ensure that libyuv builds and camera and media related CTS tests are - passing. If there are any linker errors about missing symbols, try - updating frameworks/av/media/libstagefright/export.lds. 
diff --git a/files/build_overrides/build.gni b/build_overrides/build.gni index c8490313..d9d01d51 100644 --- a/files/build_overrides/build.gni +++ b/build_overrides/build.gni @@ -13,6 +13,9 @@ build_with_chromium = false # Some non-Chromium builds don't support building java targets. enable_java_templates = true +# Enables assertions on safety checks in libc++. +enable_safe_libcxx = true + # Allow using custom suppressions files (currently not used by libyuv). asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" diff --git a/files/build_overrides/gtest.gni b/build_overrides/gtest.gni index d3c3f68c..d3c3f68c 100644 --- a/files/build_overrides/gtest.gni +++ b/build_overrides/gtest.gni diff --git a/build_overrides/partition_alloc.gni b/build_overrides/partition_alloc.gni new file mode 100644 index 00000000..dcf8ac2d --- /dev/null +++ b/build_overrides/partition_alloc.gni @@ -0,0 +1,17 @@ +# Copyright 2022 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. 
+ +# Use default values for PartitionAlloc as standalone library from +# base/allocator/partition_allocator/build_overrides/partition_alloc.gni +use_partition_alloc_as_malloc_default = false +use_allocator_shim_default = false +enable_backup_ref_ptr_support_default = false +enable_mte_checked_ptr_support_default = false +put_ref_count_in_previous_slot_default = false +enable_backup_ref_ptr_slow_checks_default = false +enable_dangling_raw_ptr_checks_default = false diff --git a/files/cleanup_links.py b/cleanup_links.py index 7d1eba9b..7d1eba9b 100755 --- a/files/cleanup_links.py +++ b/cleanup_links.py diff --git a/codereview.settings b/codereview.settings index 9782886f..b226fae5 100644 --- a/codereview.settings +++ b/codereview.settings @@ -1,5 +1,5 @@ -# This file is used by git cl to get repository specific information. +# This file is used by `git cl` to get repository specific information. +CODE_REVIEW_SERVER: codereview.chromium.org GERRIT_HOST: True PROJECT: libyuv -TRY_ON_UPLOAD: False VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ diff --git a/files/docs/deprecated_builds.md b/docs/deprecated_builds.md index ba42966c..8edefd78 100644 --- a/files/docs/deprecated_builds.md +++ b/docs/deprecated_builds.md @@ -165,11 +165,11 @@ mipsel arm32 disassembly: - third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o + llvm-objdump -d out/Release/obj/source/libyuv.row_neon.o arm64 disassembly: - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o + llvm-objdump -d out/Release/obj/source/libyuv.row_neon64.o Running tests: diff --git a/files/docs/environment_variables.md b/docs/environment_variables.md index dd5d59fb..4eb09659 100644 --- a/files/docs/environment_variables.md +++ b/docs/environment_variables.md @@ -40,6 +40,9 @@ By default the 
cpu is detected and the most advanced form of SIMD is used. But LIBYUV_DISABLE_LSX LIBYUV_DISABLE_LASX +## RISCV CPUs + LIBYUV_DISABLE_RVV + # Test Width/Height/Repeat The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions. diff --git a/files/docs/filtering.md b/docs/filtering.md index 8696976e..8696976e 100644 --- a/files/docs/filtering.md +++ b/docs/filtering.md diff --git a/files/docs/formats.md b/docs/formats.md index 12ea9465..12ea9465 100644 --- a/files/docs/formats.md +++ b/docs/formats.md diff --git a/files/docs/getting_started.md b/docs/getting_started.md index 15b19ab2..f2f71b8b 100644 --- a/files/docs/getting_started.md +++ b/docs/getting_started.md @@ -139,11 +139,11 @@ mips arm disassembly: - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt + llvm-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt + llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt + llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt Caveat: Disassembly may require optimize_max be disabled in BUILD.gn @@ -220,6 +220,47 @@ Install cmake: http://www.cmake.org/ make -j4 make package +## Building RISC-V target with cmake + +### Prerequisite: build risc-v clang toolchain and qemu + +If you don't have prebuilt clang and riscv64 qemu, run the script to download source and build them. 
+ + ./riscv_script/prepare_toolchain_qemu.sh + +After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`. + +### Cross-compile for RISC-V target + cmake -B out/Release/ -DUNIT_TEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \ + -DTOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + -DUSE_RVV=ON . + cmake --build out/Release/ + +#### Customized Compiler Flags + +Customized compiler flags are supported by `-DRISCV_COMPILER_FLAGS="xxx"`. +If `-DRISCV_COMPILER_FLAGS="xxx"` is manually assigned, other compile flags(e.g disable -march=xxx) will not be appended. + +Example: + + cmake -B out/Release/ -DUNIT_TEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \ + -DRISCV_COMPILER_FLAGS="-mcpu=sifive-x280" \ + . + +### Run on QEMU + +#### Run libyuv_unittest on QEMU + cd out/Release/ + USE_RVV=ON \ + TOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + QEMU_PREFIX_PATH={QEMU_PREFIX_PATH} \ + ../../riscv_script/run_qemu.sh libyuv_unittest + + ## Setup for Arm Cross compile See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html diff --git a/files/docs/rotation.md b/docs/rotation.md index a08430fd..a08430fd 100644 --- a/files/docs/rotation.md +++ b/docs/rotation.md diff --git a/files/download_vs_toolchain.py b/download_vs_toolchain.py index 6bc086d6..6bc086d6 100644 --- a/files/download_vs_toolchain.py +++ b/download_vs_toolchain.py diff --git a/files/Android.bp b/files/Android.bp deleted file mode 100644 index 36156287..00000000 --- a/files/Android.bp +++ /dev/null @@ -1,196 +0,0 @@ -package { - default_applicable_licenses: ["external_libyuv_files_license"], -} - -// Added automatically by a large-scale-change -// -// large-scale-change included anything that looked like it might be a license -// text as a license_text. e.g. LICENSE, NOTICE, COPYING etc. 
-// -// Please consider removing redundant or irrelevant files from 'license_text:'. -// See: http://go/android-license-faq -license { - name: "external_libyuv_files_license", - visibility: [":__subpackages__"], - license_kinds: [ - "SPDX-license-identifier-BSD", - ], - license_text: [ - "LICENSE", - "PATENTS", - ], -} - -cc_library { - name: "libyuv", - vendor_available: true, - product_available: true, - host_supported: true, - vndk: { - enabled: true, - }, - - srcs: [ - "source/compare.cc", - "source/compare_common.cc", - "source/compare_gcc.cc", - "source/compare_msa.cc", - "source/compare_neon.cc", - "source/compare_neon64.cc", - "source/convert.cc", - "source/convert_argb.cc", - "source/convert_from.cc", - "source/convert_from_argb.cc", - "source/convert_jpeg.cc", - "source/convert_to_argb.cc", - "source/convert_to_i420.cc", - "source/cpu_id.cc", - "source/mjpeg_decoder.cc", - "source/mjpeg_validate.cc", - "source/planar_functions.cc", - "source/rotate.cc", - "source/rotate_any.cc", - "source/rotate_argb.cc", - "source/rotate_common.cc", - "source/rotate_gcc.cc", - "source/rotate_msa.cc", - "source/rotate_neon.cc", - "source/rotate_neon64.cc", - "source/row_any.cc", - "source/row_common.cc", - "source/row_gcc.cc", - "source/row_msa.cc", - "source/row_neon.cc", - "source/row_neon64.cc", - "source/scale.cc", - "source/scale_any.cc", - "source/scale_argb.cc", - "source/scale_common.cc", - "source/scale_gcc.cc", - "source/scale_msa.cc", - "source/scale_neon.cc", - "source/scale_neon64.cc", - "source/scale_rgb.cc", - "source/scale_uv.cc", - "source/video_common.cc", - ], - - cflags: [ - "-Wall", - "-Werror", - "-Wno-unused-parameter", - "-fexceptions", - "-DHAVE_JPEG", - ], - - arch: { - arm: { - cflags: ["-mfpu=neon"], - }, - }, - - shared_libs: ["libjpeg"], - - export_include_dirs: ["include"], - - apex_available: [ - "//apex_available:platform", - "com.android.media.swcodec", - ], - min_sdk_version: "29", -} - -// compatibilty static library until all uses of 
libyuv_static are replaced -// with libyuv (b/37646797) -cc_library_static { - name: "libyuv_static", - vendor_available: true, - whole_static_libs: ["libyuv"], - apex_available: [ - "//apex_available:platform", - "com.android.media.swcodec", - ], - min_sdk_version: "29", -} - -cc_test { - name: "libyuv_unittest", - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], - cflags: ["-Wall", "-Werror"], - srcs: [ - "unit_test/basictypes_test.cc", - "unit_test/color_test.cc", - "unit_test/compare_test.cc", - "unit_test/convert_test.cc", - "unit_test/cpu_test.cc", - "unit_test/cpu_thread_test.cc", - "unit_test/math_test.cc", - "unit_test/planar_test.cc", - "unit_test/rotate_argb_test.cc", - "unit_test/rotate_test.cc", - "unit_test/scale_argb_test.cc", - "unit_test/scale_rgb_test.cc", - "unit_test/scale_test.cc", - "unit_test/scale_uv_test.cc", - "unit_test/unit_test.cc", - "unit_test/video_common_test.cc", - ], -} - -cc_test { - name: "compare", - gtest: false, - srcs: [ - "util/compare.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "cpuid", - gtest: false, - srcs: [ - "util/cpuid.c", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "i444tonv12_eg", - gtest: false, - srcs: [ - "util/i444tonv12_eg.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "psnr", - gtest: false, - srcs: [ - "util/psnr_main.cc", - "util/psnr.cc", - "util/ssim.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "yuvconstants", - gtest: false, - srcs: [ - "util/yuvconstants.c", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "yuvconvert", - gtest: false, - srcs: [ - "util/yuvconvert.cc", - ], - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], -} diff --git a/files/LICENSE b/files/LICENSE deleted file mode 100644 index c911747a..00000000 --- a/files/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/files/codereview.settings b/files/codereview.settings deleted file mode 100644 index b226fae5..00000000 --- a/files/codereview.settings +++ /dev/null @@ -1,5 +0,0 @@ -# This file is used by `git cl` to get repository specific information. 
-CODE_REVIEW_SERVER: codereview.chromium.org -GERRIT_HOST: True -PROJECT: libyuv -VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ diff --git a/files/public.mk b/files/public.mk deleted file mode 100644 index 1342307a..00000000 --- a/files/public.mk +++ /dev/null @@ -1,13 +0,0 @@ -# This file contains all the common make variables which are useful for -# anyone depending on this library. -# Note that dependencies on NDK are not directly listed since NDK auto adds -# them. - -LIBYUV_INCLUDES := $(LIBYUV_PATH)/include - -LIBYUV_C_FLAGS := - -LIBYUV_CPP_FLAGS := - -LIBYUV_LDLIBS := -LIBYUV_DEP_MODULES := diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc deleted file mode 100644 index 7640d946..00000000 --- a/files/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. 
-uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( 
- "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc deleted file mode 100644 index ff212ade..00000000 --- a/files/source/rotate_common.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -void TransposeWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -void TransposeUVWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -void TransposeWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} - -void TransposeUVWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i; - for (i = 0; i < width * 2; i += 2) { - int j; - for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * 
dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; - } - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc deleted file mode 100644 index f8de6083..00000000 --- a/files/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 
30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - 
"gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = 
(u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - 
"ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 
v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc deleted file mode 100644 index 362fd1cf..00000000 --- a/files/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include <string.h> // For memcpy and memset. 
- -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], 
%[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] 
\n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - 
"packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or 
%[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh 
%[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 
0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] 
\n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. 
// Convert 4 ARGB pixels per iteration to RGB565 with ordered dithering
// (Loongson MMI SIMD). Per the matrix description above: dither4 holds one
// row of a 4x4 dither matrix, one byte per pixel; the bytes are added to
// the B, G and R channels before truncation so the 8->5/6/5 bit reduction
// is ordered-dithered rather than plainly truncated.
//
// src_argb: 16 bytes (4 ARGB pixels) read per loop iteration.
// dst_rgb:  8 bytes (4 RGB565 pixels) written per loop iteration.
// width:    pixel count, processed 4 at a time; the loop runs until width
//           drops to 0 or below, so the caller is presumably expected to
//           pass a multiple of 4 -- TODO confirm against callers.
void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
                               uint8_t* dst_rgb,
                               const uint32_t dither4,
                               int width) {
  uint64_t src0, src1;
  uint64_t ftmp[3];
  // Saturation limit: 0x00ff in each of the four 16-bit lanes.
  uint64_t c0 = 0x00ff00ff00ff00ff;

  // NOTE(review): src_argb, dst_rgb, width and dither are declared as
  // input-only operands ("r"/"f") yet are modified inside the asm body
  // (daddiu on the pointers/counter, punpcklbh into %[dither]); strictly
  // they should be "+r"/"+f" input-output operands -- TODO confirm, the
  // sibling MMI rows in this file share the same pattern.
  __asm__ volatile(
      // Widen the 4 dither bytes to 4 halfwords, one per pixel lane.
      "punpcklbh  %[dither],   %[dither],    %[zero]       \n\t"
      "1:                                                  \n\t"
      // Unaligned-safe load of 16 bytes = 4 ARGB pixels.
      "gsldrc1    %[src0],     0x00(%[src_argb])           \n\t"
      "gsldlc1    %[src0],     0x07(%[src_argb])           \n\t"
      "gsldrc1    %[src1],     0x08(%[src_argb])           \n\t"
      "gsldlc1    %[src1],     0x0f(%[src_argb])           \n\t"

      // De-interleave the 4 pixels into per-channel halfword vectors;
      // after these shuffles b/g/r each hold the 4 zero-extended values
      // of one channel (alpha is discarded).
      "punpcklbh  %[b],        %[src0],      %[src1]       \n\t"
      "punpckhbh  %[g],        %[src0],      %[src1]       \n\t"
      "punpcklbh  %[src0],     %[b],         %[g]          \n\t"
      "punpckhbh  %[src1],     %[b],         %[g]          \n\t"
      "punpcklbh  %[b],        %[src0],      %[zero]       \n\t"
      "punpckhbh  %[g],        %[src0],      %[zero]       \n\t"
      "punpcklbh  %[r],        %[src1],      %[zero]       \n\t"

      // Add the dither value, then clamp each lane to 255: when a lane
      // exceeds 0x00ff, pcmpgth yields all-ones for it, the or saturates
      // the lane, and the and masks it back down to 0x00ff.
      "paddh      %[b],        %[b],         %[dither]     \n\t"
      "paddh      %[g],        %[g],         %[dither]     \n\t"
      "paddh      %[r],        %[r],         %[dither]     \n\t"
      "pcmpgth    %[src0],     %[b],         %[c0]         \n\t"
      "or         %[src0],     %[src0],      %[b]          \n\t"
      "and        %[b],        %[src0],      %[c0]         \n\t"
      "pcmpgth    %[src0],     %[g],         %[c0]         \n\t"
      "or         %[src0],     %[src0],      %[g]          \n\t"
      "and        %[g],        %[src0],      %[c0]         \n\t"
      "pcmpgth    %[src0],     %[r],         %[c0]         \n\t"
      "or         %[src0],     %[src0],      %[r]          \n\t"
      "and        %[r],        %[src0],      %[c0]         \n\t"

      // Reduce to 5/6/5 significant bits per channel.
      "psrlh      %[b],        %[b],         %[three]      \n\t"
      "psrlh      %[g],        %[g],         %[two]        \n\t"
      "psrlh      %[r],        %[r],         %[three]      \n\t"

      // Pack each pixel as r<<11 | g<<5 | b.
      "psllh      %[g],        %[g],         %[five]       \n\t"
      "psllh      %[r],        %[r],         %[eleven]     \n\t"
      "or         %[b],        %[b],         %[g]          \n\t"
      "or         %[b],        %[b],         %[r]          \n\t"

      // Unaligned-safe store of 8 bytes = 4 RGB565 pixels.
      "gssdrc1    %[b],        0x00(%[dst_rgb])            \n\t"
      "gssdlc1    %[b],        0x07(%[dst_rgb])            \n\t"

      "daddiu     %[src_argb], %[src_argb],  0x10          \n\t"
      "daddiu     %[dst_rgb],  %[dst_rgb],   0x08          \n\t"
      "daddiu     %[width],    %[width],    -0x04          \n\t"
      "bgtz       %[width],    1b                          \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
        [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
        [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
      : "memory");
}
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], 
%[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) 
\n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez 
%[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] 
\n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh 
%[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], 
%[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" 
- "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] 
"=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - 
"pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 
0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] 
\n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw 
%[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], 
%[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], 
%[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" 
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - 
"packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) 
\n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - 
"gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], 
%[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] 
\n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] 
"=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - 
"gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t 
tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - 
"punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], 
%[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] 
\n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw 
%[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 
0x0001004200810019;

  __asm__ volatile(
      "1: \n\t"
      // Pixels 0-1 (bytes 0x00-0x05): bias + coefficient dot-product per pixel.
      "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      // Realign the second 3-byte pixel before widening its bytes.
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest0], %[dest0], %[src] \n\t"
      "psrlw %[dest0], %[dest0], %[eight] \n\t"

      // Pixels 2-3 (bytes 0x06-0x0b).
      "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest1], %[dest1], %[src] \n\t"
      "psrlw %[dest1], %[dest1], %[eight] \n\t"

      // Pixels 4-5 (bytes 0x0c-0x11).
      "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest2], %[dest2], %[src] \n\t"
      "psrlw %[dest2], %[dest2], %[eight] \n\t"

      // Pixels 6-7 (bytes 0x12-0x17).
      "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest3], %[dest3], %[src] \n\t"
      "psrlw %[dest3], %[dest3], %[eight] \n\t"

      // Pack the eight Y results to bytes and store.
      "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
      "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
      "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
      "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
      "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"

      // Advance 8 pixels (0x18 = 24 bytes) and loop.
      "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
      "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
      "daddi %[width], %[width], -0x08 \n\t"
      "bnez %[width], 1b \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

// RGB24ToUVRow_MMI: compute one row of U and one row of V from two adjacent
// rows of packed 24-bit RGB pixels.  Each loop iteration consumes sixteen
// pixels (0x30 bytes) from both rows; loads step by 6 bytes (two pixels) with
// a dsll-by-8 realignment of the second pixel of each pair, averages each 2x2
// block (+1, >>1), applies mask_u/mask_v via pmaddhw, and stores 8 U and 8 V
// bytes.  The 0x4040 constant is inserted into lane 0 for U (after a
// dsll-by-16) and lane 3 for V.
void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[13];
  uint64_t tmp[1];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0013002500380002;
  const uint64_t mask_v = 0x00020038002f0009;

  __asm__ volatile(
      // ftmp12 = per-lane rounding term for the 2x2 average.
      "dli %[tmp0], 0x0001000100010001 \n\t"
      "dmtc1 %[tmp0], %[ftmp12] \n\t"
      "1: \n\t"
      // src_rgb1 = second source row (src_rgb + stride).
      "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
      // 2x2 block 1 (bytes 0x00-0x05 of each row).
      "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
      "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
      "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
      "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"

      // 2x2 block 2 (bytes 0x06-0x0b).
      "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[src_lo], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
      "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"

      // Combine blocks 1-2: finish the dot products by word subtraction and
      // scale by >>8 (operand order differs between U and V).
      "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
      "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
      "psubw %[dest0_u], %[src0], %[src1] \n\t"
      "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
      "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
      "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
      "psubw %[dest0_v], %[src1], %[src0] \n\t"
      "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"

      // 2x2 block 3 (bytes 0x0c-0x11).
      "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
      "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
      "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
      "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"

      // 2x2 block 4 (bytes 0x12-0x17).
      "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[src_lo], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
      "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"

      // Combine blocks 3-4.
      "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
      "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
      "psubw %[dest1_u], %[src0], %[src1] \n\t"
      "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
      "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
      "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
      "psubw %[dest1_v], %[src1], %[src0] \n\t"
      "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"

      // 2x2 block 5 (bytes 0x18-0x1d).
      "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
      "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
      "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
      "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"

      // 2x2 block 6 (bytes 0x1e-0x23).
      "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[src_lo], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
      "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"

      // Combine blocks 5-6.
      "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
      "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
      "psubw %[dest2_u], %[src0], %[src1] \n\t"
      "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
      "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
      "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
      "psubw %[dest2_v], %[src1], %[src0] \n\t"
      "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"

      // 2x2 block 7 (bytes 0x24-0x29).
      "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
      "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
      "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
      "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"

      // 2x2 block 8 (bytes 0x2a-0x2f).
      "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t"
      "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t"
      "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
      "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
      "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
      "dsll %[src0], %[src0], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
      "paddh %[src0], %[src_lo], %[src_hi] \n\t"
      "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_lo] \n\t"
      "dsll %[src1], %[src1], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
      "paddh %[src0], %[src0], %[src_hi] \n\t"
      "paddh %[src0], %[src0], %[ftmp12] \n\t"
      "psrlh %[src0], %[src0], %[one] \n\t"
      "dsll %[src_lo], %[src0], %[sixteen] \n\t"
      "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
      "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"

      // Combine blocks 7-8.
      "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
      "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
      "psubw %[dest3_u], %[src0], %[src1] \n\t"
      "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
      "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
      "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
      "psubw %[dest3_v], %[src1], %[src0] \n\t"
      "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"

      // Pack the four U results to 8 bytes and store.
      "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
      "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
      "packushb %[dest0_u], %[src0], %[src1] \n\t"
      "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
      "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"

      // Pack the four V results to 8 bytes and store.
      "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
      "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
      "packushb %[dest0_v], %[src0], %[src1] \n\t"
      "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
      "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"

      // Advance 16 pixels (0x30 = 48 bytes) and loop while width > 0.
      "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t"
      "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
      "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
      "daddi %[width], %[width], -0x10 \n\t"
      "bgtz %[width], 1b \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
        [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
      : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
        [sixteen] "f"(0x10)
      : "memory");
}

// RAWToYRow_MMI: convert packed RAW (reverse-order 24-bit RGB) pixels to luma;
// same structure as RGB24ToYRow_MMI with the coefficient lanes reversed in
// `mask`.  (Body continues past this chunk.)
void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001001900810042;

__asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - 
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" 
- "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], 
%[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 
%[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd 
%[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd 
%[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; 
- const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh 
%[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] 
\n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] 
\n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - 
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh 
%[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] 
"r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] 
\n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) 
\n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] 
\n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - 
"punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] 
\n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] 
\n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) 
\n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] 
\n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], 
%[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], 
%[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh 
%[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] 
"f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh 
%[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], 
%[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh 
%[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - 
"psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; 
- uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - 
"pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], 
%[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - 
"punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] 
"f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - 
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t 
src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;                      // zero vector for unpack
  const uint64_t mask1 = 0x01;                     // inserted into word 3
  const uint64_t mask2 = 0x0080004D0096001DULL;    // gray weights for pmaddhw
  const uint64_t mask3 = 0xFF000000FF000000ULL;    // alpha-byte mask (2 px)
  const uint64_t mask4 = ~mask3;                   // BGR-byte mask
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1: \n\t"
      // Load 2 ARGB pixels (8 bytes, unaligned).
      "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"

      // Save the two alpha bytes so they can be merged back at the end.
      "and %[src37], %[src], %[mask3] \n\t"

      // Low pixel: widen to halfwords, weight with mask2, horizontally sum
      // the two 32-bit partial products, then >>8 to get the gray value.
      "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
      "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
      "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
      "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
      "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
      "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"

      // High pixel: same sequence on the upper 4 bytes.
      "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
      "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
      "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
      "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
      "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
      "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"

      // Broadcast gray into B/G/R, clear the alpha lanes, then restore the
      // original alpha bytes.
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
      "and %[dest], %[dest], %[mask4] \n\t"
      "or %[dest], %[dest], %[src37] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      // 2 pixels (8 bytes) per iteration.
      "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
        [src37] "=&f"(src37)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
      : "memory");
}

// Convert a row of image to Sepia tone.
// In-place transform: reads and writes dst_argb, 2 pixels per iteration.
// mask1/mask2/mask3 hold the per-channel sepia coefficient vectors for B/G/R;
// alpha bytes are preserved via mask4.
void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
  uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x002300440011ULL;     // sepia B coefficients
  const uint64_t mask2 = 0x002D00580016ULL;     // sepia G coefficients
  const uint64_t mask3 = 0x003200620018ULL;     // sepia R coefficients
  const uint64_t mask4 = 0xFF000000FF000000ULL; // alpha-byte mask
  const uint64_t shift = 0x07;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      // Save alpha bytes for the final merge.
      "and %[dest37], %[dest], %[mask4] \n\t"

      // Low pixel: three weighted dot products (B, G, R), horizontal sums,
      // >>7 scaling, packed back to halfwords.
      "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
      "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
      "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
      "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
      "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
      "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
      "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest0], %[dest0], %[shift] \n\t"
      "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
      "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
      "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest1], %[dest1], %[shift] \n\t"
      "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"

      // High pixel: identical sequence on the upper 4 bytes.
      "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
      "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
      "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
      "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
      "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
      "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
      "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest0], %[dest0], %[shift] \n\t"
      "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
      "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
      "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
      "psrlw %[dest1], %[dest1], %[shift] \n\t"
      "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"

      // Pack to bytes and restore the saved alpha bytes.
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
      "or %[dest], %[dest], %[dest37] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
        [dest] "=&f"(dest)
      : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
        [mask4] "f"(mask4), [shift] "f"(shift)
      : "memory");
}

// Apply color matrix to a row of image. Matrix is signed.
// TODO(fbarchard): Consider adding rounding (+32).
// 2 pixels per iteration; the 4x4 int8 matrix is reloaded from matrix_argb
// each iteration, sign-extended to halfwords via psllh/psrah by 8.
// (Function continues past this chunk boundary.)
void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
      dest3;
  uint64_t matrix, matrix_hi, matrix_lo;
  uint64_t tmp0, tmp1;
  const uint64_t shift0 = 0x06;  // result scale: >>6
  const uint64_t shift1 = 0x08;  // sign-extension shift for int8 matrix
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x08;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"

      // Low pixel widened to halfwords.
      "punpcklbh %[src_lo], %[src], %[mask0] \n\t"

      // First half of the matrix (rows 0-1): load, sign-extend each int8
      // coefficient to int16, dot-product against the low pixel.
      "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
      "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
      "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
      "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
      "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
      "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
      "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
      "psraw %[dest0], %[dest0], %[shift0] \n\t"

      // Second half of the matrix (rows 2-3) against the low pixel.
      "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
      "gsldrc1
%[matrix], 0x08(%[matrix_ptr]) \n\t"
      "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
      "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
      "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
      "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
      "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
      "psraw %[dest1], %[dest1], %[shift0] \n\t"

      // High pixel widened to halfwords.
      "punpckhbh %[src_hi], %[src], %[mask0] \n\t"

      // Matrix rows 0-1 against the high pixel (matrix reloaded and
      // sign-extended again).
      "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
      "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
      "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
      "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
      "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
      "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
      "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
      "psraw %[dest2], %[dest2], %[shift0] \n\t"

      // Matrix rows 2-3 against the high pixel.
      "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
      "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
      "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
      "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
      "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
      "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
      "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
      "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
      "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
      "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
      "psraw %[dest3], %[dest3], %[shift0] \n\t"

      // Pack the four 32-bit channel groups to unsigned bytes and store
      // 2 output pixels.
      "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
      "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
      "packushb %[dest], %[tmp0], %[tmp1] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
        [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
        [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
      : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
        [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
      : "memory");
}

// Scale each channel of a row of ARGB pixels by the bytes of |value|.
// 2 pixels per iteration.  Each source byte b and scale byte v are widened
// by self-unpack (b*257, v*257 patterns), multiplied with pmulhuh and
// shifted, approximating (b * v) >> 8.
// NOTE(review): %[value] is modified by the punpcklbh inside the loop but is
// declared as an input-only "f" operand — relies on the loop re-reading the
// already-widened register; verify against the original build.
void ARGBShadeRow_MMI(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
      // Widen source bytes by duplicating them into both halves.
      "punpcklbh %[src_lo], %[src], %[src] \n\t"
      "punpckhbh %[src_hi], %[src], %[src] \n\t"

      // Widen the scale bytes the same way.
      "punpcklbh %[value], %[value], %[value] \n\t"

      // High 16 bits of the unsigned products, then >>8.
      "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
      "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
      "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
      "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
        [dest] "=&f"(dest)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [value] "f"(value), [shift] "f"(shift)
      : "memory");
}

// Multiply two rows of ARGB pixels channel-wise: dst = (src0 * src1) >> 8
// (approximately; src0 bytes are widened by self-unpack, src1 by
// zero-extension, combined with pmulhuh).  2 pixels per iteration.
void ARGBMultiplyRow_MMI(const uint8_t* src_argb,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
  uint64_t dest, dest_lo, dest_hi;
  const uint64_t mask = 0x0;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
      "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
      // src0 widened by duplication (b -> b*257 per halfword).
      "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
      "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"

      "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
      // src1 widened by zero-extension.
      "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
      "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"

      "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
      "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
      "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
        [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
      : "memory");
}

// Add two rows of ARGB pixels with per-byte unsigned saturation (paddusb).
// 2 pixels per iteration.  (Continues past this chunk boundary.)
void ARGBAddRow_MMI(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
      "gsldrc1
%[src0], 0x00(%[src0_ptr]) \n\t"
      "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
      // Saturating unsigned byte add.
      "paddusb %[dest], %[src0], %[src1] \n\t"
      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
      "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

// Subtract two rows of ARGB pixels with per-byte unsigned saturation
// (psubusb; results clamp at 0).  2 pixels per iteration.
void ARGBSubtractRow_MMI(const uint8_t* src_argb,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
      "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
      "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
      "psubusb %[dest], %[src0], %[src1] \n\t"
      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
      "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

// Sobel functions which mimics SSSE3.

// Horizontal Sobel filter: for each column, sobel_x = |(a + 2b + c) -
// (a' + 2b' + c')| where a,b,c come from three adjacent rows at column i and
// the primed values at column i+2.  Processes 8 output bytes per iteration
// (two groups of 4 halfwords), clamped to [0,255] by packushb.
void SobelXRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int width) {
  uint64_t y00 = 0, y10 = 0, y20 = 0;
  uint64_t y02 = 0, y12 = 0, y22 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1: \n\t"
      // First 4 columns: load row0/row1/row2 at i and i+2 (unaligned).
      "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t"  // a=src_y0[i]
      "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
      "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t"  // a_sub=src_y0[i+2]
      "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"

      "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t"  // b=src_y1[i]
      "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
      "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t"  // b_sub=src_y1[i+2]
      "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"

      "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t"  // c=src_y2[i]
      "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
      "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t"  // c_sub=src_y2[i+2]
      "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"

      // Zero-extend the low 4 bytes of each load to halfwords.
      "punpcklbh %[y00], %[y00], %[zero] \n\t"
      "punpcklbh %[y10], %[y10], %[zero] \n\t"
      "punpcklbh %[y20], %[y20], %[zero] \n\t"

      "punpcklbh %[y02], %[y02], %[zero] \n\t"
      "punpcklbh %[y12], %[y12], %[zero] \n\t"
      "punpcklbh %[y22], %[y22], %[zero] \n\t"

      "paddh %[y00], %[y00], %[y10] \n\t"  // a+b
      "paddh %[y20], %[y20], %[y10] \n\t"  // c+b
      "paddh %[y00], %[y00], %[y20] \n\t"  // a+2b+c

      "paddh %[y02], %[y02], %[y12] \n\t"  // a_sub+b_sub
      "paddh %[y22], %[y22], %[y12] \n\t"  // c_sub+b_sub
      "paddh %[y02], %[y02], %[y22] \n\t"  // a_sub+2b_sub+c_sub

      // |x - y| computed as max(x,y) - min(x,y).
      "pmaxsh %[y10], %[y00], %[y02] \n\t"
      "pminsh %[y20], %[y00], %[y02] \n\t"
      "psubh %[sobel], %[y10], %[y20] \n\t"  // Abs

      // Next 4 columns (offsets +4 and +6), same computation.
      "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
      "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
      "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
      "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"

      "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
      "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
      "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
      "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"

      "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
      "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
      "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
      "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"

      "punpcklbh %[y00], %[y00], %[zero] \n\t"
      "punpcklbh %[y10], %[y10], %[zero] \n\t"
      "punpcklbh %[y20], %[y20], %[zero] \n\t"

      "punpcklbh %[y02], %[y02], %[zero] \n\t"
      "punpcklbh %[y12], %[y12], %[zero] \n\t"
      "punpcklbh %[y22], %[y22], %[zero] \n\t"

      "paddh %[y00], %[y00], %[y10] \n\t"
      "paddh %[y20], %[y20], %[y10] \n\t"
      "paddh %[y00], %[y00], %[y20] \n\t"

      "paddh %[y02], %[y02], %[y12] \n\t"
      "paddh %[y22], %[y22], %[y12] \n\t"
      "paddh %[y02], %[y02], %[y22] \n\t"

      "pmaxsh %[y10], %[y00], %[y02] \n\t"
      "pminsh %[y20], %[y00], %[y02] \n\t"
      "psubh %[y00], %[y10], %[y20] \n\t"

      // Pack both halfword groups to 8 unsigned bytes and store.
      "packushb %[sobel], %[sobel], %[y00] \n\t"  // clamp255
      "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
      "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"

      "daddiu %[src_y0], %[src_y0], 8 \n\t"
      "daddiu %[src_y1], %[src_y1], 8 \n\t"
      "daddiu %[src_y2], %[src_y2], 8 \n\t"
      "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"  // branch delay slot
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
        [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
        [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

// Vertical Sobel filter: sobel_y = |(a + 2b + c) - (a' + 2b' + c')| where
// a,b,c are three adjacent columns of row y0 and the primed values the same
// columns of row y1.  8 output bytes per iteration.
// (Continues past this chunk boundary.)
void SobelYRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int width) {
  uint64_t y00 = 0, y01 = 0, y02 = 0;
  uint64_t y10 = 0, y11 = 0, y12 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t"  // a=src_y0[i]
      "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
      "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t"  // b=src_y0[i+1]
      "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
      "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t"  // c=src_y0[i+2]
      "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"

      "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t"  // a_sub=src_y1[i]
"gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
      "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t"  // b_sub=src_y1[i+1]
      "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
      "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t"  // c_sub=src_y1[i+2]
      "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"

      // Zero-extend low 4 bytes of each load to halfwords.
      "punpcklbh %[y00], %[y00], %[zero] \n\t"
      "punpcklbh %[y01], %[y01], %[zero] \n\t"
      "punpcklbh %[y02], %[y02], %[zero] \n\t"

      "punpcklbh %[y10], %[y10], %[zero] \n\t"
      "punpcklbh %[y11], %[y11], %[zero] \n\t"
      "punpcklbh %[y12], %[y12], %[zero] \n\t"

      "paddh %[y00], %[y00], %[y01] \n\t"  // a+b
      "paddh %[y02], %[y02], %[y01] \n\t"  // c+b
      "paddh %[y00], %[y00], %[y02] \n\t"  // a+2b+c

      "paddh %[y10], %[y10], %[y11] \n\t"  // a_sub+b_sub
      "paddh %[y12], %[y12], %[y11] \n\t"  // c_sub+b_sub
      "paddh %[y10], %[y10], %[y12] \n\t"  // a_sub+2b_sub+c_sub

      // |x - y| via max - min.
      "pmaxsh %[y02], %[y00], %[y10] \n\t"
      "pminsh %[y12], %[y00], %[y10] \n\t"
      "psubh %[sobel], %[y02], %[y12] \n\t"  // Abs

      // Next 4 columns (offsets +4, +5, +6).
      "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
      "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
      "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
      "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
      "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
      "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"

      "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
      "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
      "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
      "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
      "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
      "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"

      "punpcklbh %[y00], %[y00], %[zero] \n\t"
      "punpcklbh %[y01], %[y01], %[zero] \n\t"
      "punpcklbh %[y02], %[y02], %[zero] \n\t"

      "punpcklbh %[y10], %[y10], %[zero] \n\t"
      "punpcklbh %[y11], %[y11], %[zero] \n\t"
      "punpcklbh %[y12], %[y12], %[zero] \n\t"

      "paddh %[y00], %[y00], %[y01] \n\t"
      "paddh %[y02], %[y02], %[y01] \n\t"
      "paddh %[y00], %[y00], %[y02] \n\t"

      "paddh %[y10], %[y10], %[y11] \n\t"
      "paddh %[y12], %[y12], %[y11] \n\t"
      "paddh %[y10], %[y10], %[y12] \n\t"

      "pmaxsh %[y02], %[y00], %[y10] \n\t"
      "pminsh %[y12], %[y00], %[y10] \n\t"
      "psubh %[y00], %[y02], %[y12] \n\t"

      // Pack to 8 unsigned bytes and store.
      "packushb %[sobel], %[sobel], %[y00] \n\t"  // clamp255
      "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
      "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"

      "daddiu %[src_y0], %[src_y0], 8 \n\t"
      "daddiu %[src_y1], %[src_y1], 8 \n\t"
      "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"  // branch delay slot
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
        [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
        [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

// Combine sobelx and sobely rows into gray ARGB: s = sat(x + y), each output
// pixel is (255, s, s, s).  Consumes 8 sobel bytes and produces 8 ARGB
// pixels (32 bytes) per iteration.
void SobelRow_MMI(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  double temp[3];
  uint64_t c1 = 0xff000000ff000000;  // alpha = 255 for two pixels
  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t"  // a=src_sobelx[i]
      "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
      "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t"  // b=src_sobely[i]
      "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
      // s7 s6 s5 s4 s3 s2 s1 s0 = a+b (saturating)
      "paddusb %[t2] , %[t0], %[t1] \n\t"

      // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
      "punpcklbh %[t0], %[t2], %[t2] \n\t"

      // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
      "punpcklbh %[t1], %[t0], %[t0] \n\t"
      "or %[t1], %[t1], %[c1] \n\t"
      // 255 s1 s1 s1 255 s0 s0 s0
      "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
      "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"

      // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
      "punpckhbh %[t1], %[t0], %[t0] \n\t"
      "or %[t1], %[t1], %[c1] \n\t"
      // 255 s3 s3 s3 255 s2 s2 s2
      "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
      "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"

      // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
      "punpckhbh %[t0], %[t2], %[t2] \n\t"

      // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
      "punpcklbh %[t1], %[t0], %[t0] \n\t"
      "or %[t1], %[t1], %[c1] \n\t"
      "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
      "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"

      // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
      "punpckhbh %[t1], %[t0], %[t0] \n\t"
      "or %[t1], %[t1], %[c1] \n\t"
      "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
      "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"

      "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
      "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
      "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"  // branch delay slot
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
      : "memory");
}

// Combine sobelx and sobely rows into a single gray plane:
// dst_y[i] = sat(sobelx[i] + sobely[i]).  8 bytes per iteration.
void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width) {
  uint64_t tr = 0;
  uint64_t tb = 0;
  __asm__ volatile(
      "1: \n\t"
      "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
      "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t"  // r=src_sobelx[i]
      "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
      "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t"  // b=src_sobely[i]
      "paddusb %[tr], %[tr], %[tb] \n\t"  // g
      "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
      "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"

      "daddiu %[dst_y], %[dst_y], 8 \n\t"
      "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
      "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"  // branch delay slot
      : [tr] "=&f"(tr), [tb] "=&f"(tb)
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_y] "r"(dst_y), [width] "r"(width)
      : "memory");
}

// Combine sobelx and sobely into ARGB (continues beyond this chunk; body
// incomplete in this view).
void SobelXYRow_MMI(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t temp[3];
  uint64_t result = 0;
  uint64_t gb = 0;
  uint64_t cr = 0;
  uint64_t c1 = 0xffffffffffffffff;
  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t"  // r=src_sobelx[i]
      "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
      "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t"  // b=src_sobely[i]
      "gsldrc1 %[tb],
0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. 
- uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - 
"pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" 
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - 
"packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] 
"r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - 
int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], 
%[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - 
"punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], 
%[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). 
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). 
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] 
"=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] 
"f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : 
[src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], 
%[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. 
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[row_ptr]) 
\n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] 
\n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] \n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. 
-void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - 
"daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 
0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b 
\n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], 
-0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] \n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], 
%[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t 
b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], 
%[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) 
\n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 
%[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], 
%[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], 
%[rgbbuf_ptr], 0x10                \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      // Outputs: all MMI/FP registers are early-clobber temporaries.
      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),               [a]"=&f"(a),
        [b_vec0]"=&f"(b_vec[0]),   [b_vec1]"=&f"(b_vec[1]),
        [g_vec0]"=&f"(g_vec[0]),   [g_vec1]"=&f"(g_vec[1]),
        [r_vec0]"=&f"(r_vec[0]),   [r_vec1]"=&f"(r_vec[1]),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [u_ptr]"r"(src_u),
        [v_ptr]"r"(src_v),         [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [a_ptr]"r"(src_a),         [zero]"f"(0x00),
        [six]"f"(0x6),             [five]"f"(0x55),
        [mask]"f"(mask)
      : "memory"
  );
}

// Convert 4 I422 pixels (4 Y, 2 U, 2 V bytes) per iteration to 12 bytes of
// packed RGB24 using Loongson MMI SIMD.
// The loop decrements width by 4 and exits on zero, so it assumes width is a
// positive multiple of 4 (callers pad via the _Any wrappers — TODO confirm).
void I422ToRGB24Row_MMI(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  uint64_t y,u,v;
  uint64_t b_vec[2],g_vec[2],r_vec[2];
  uint64_t mask = 0xff00ff00ff00ff00ULL;  // selects high byte of each halfword
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      // Load conversion coefficients from yuvconstants (fixed layout:
      // ub@0x00, ug/vg@0x20, vr@0x40, bb@0x60, bg@0x80, br@0xa0, yg@0xc0)
      // and splat the u/v gains across all four halfwords.
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask]      \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask]      \n\t"

      "1:                                                       \n\t"
      // Unaligned 4-byte loads of Y, U, V.
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[u_ptr])               \n\t"
      "gslwrc1    %[u],            0x00(%[u_ptr])               \n\t"
      "gslwlc1    %[v],            0x03(%[v_ptr])               \n\t"
      "gslwrc1    %[v],            0x00(%[v_ptr])               \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"  // y*0x0101
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"  // y1

      // u3|u2|u1|u0 --> u1|u1|u0|u0
      "punpcklbh  %[u],            %[u],           %[u]         \n\t"  // u
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "paddsh     %[b_vec0],       %[y],           %[bb]        \n\t"
      "pmullh     %[b_vec1],       %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec0],       %[b_vec0],      %[b_vec1]    \n\t"
      "psrah      %[b_vec0],       %[b_vec0],      %[six]       \n\t"

      // v3|v2|v1|v0 --> v1|v1|v0|v0
      "punpcklbh  %[v],            %[v],           %[v]         \n\t"
      "punpcklbh  %[v],            %[v],           %[zero]      \n\t"
      "paddsh     %[g_vec0],       %[y],           %[bg]        \n\t"
      "pmullh     %[g_vec1],       %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec0],       %[g_vec0],      %[g_vec1]    \n\t"
      "pmullh     %[g_vec1],       %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec0],       %[g_vec0],      %[g_vec1]    \n\t"
      "psrah      %[g_vec0],       %[g_vec0],      %[six]       \n\t"

      "paddsh     %[r_vec0],       %[y],           %[br]        \n\t"
      "pmullh     %[r_vec1],       %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec0],       %[r_vec0],      %[r_vec1]    \n\t"
      "psrah      %[r_vec0],       %[r_vec0],      %[six]       \n\t"

      // Saturate to bytes and interleave B,G,R; then squeeze out the 4th
      // byte lane to emit 12 packed RGB24 bytes (8-byte + 4-byte stores).
      "packushb   %[r_vec0],       %[b_vec0],      %[r_vec0]    \n\t"
      "packushb   %[g_vec0],       %[g_vec0],      %[zero]      \n\t"
      "punpcklbh  %[b_vec0],       %[r_vec0],      %[g_vec0]    \n\t"
      "punpckhbh  %[r_vec0],       %[r_vec0],      %[g_vec0]    \n\t"
      "punpcklhw  %[g_vec0],       %[b_vec0],      %[r_vec0]    \n\t"
      "punpckhhw  %[g_vec1],       %[b_vec0],      %[r_vec0]    \n\t"

      "punpckhwd  %[r_vec0],       %[g_vec0],      %[g_vec0]    \n\t"
      "psllw      %[r_vec1],       %[r_vec0],      %[lmove1]    \n\t"
      "or         %[g_vec0],       %[g_vec0],      %[r_vec1]    \n\t"
      "psrlw      %[r_vec1],       %[r_vec0],      %[rmove1]    \n\t"
      "pextrh     %[r_vec1],       %[r_vec1],      %[zero]      \n\t"
      "pinsrh_2   %[g_vec0],       %[g_vec0],      %[r_vec1]    \n\t"
      "pextrh     %[r_vec1],       %[g_vec1],      %[zero]      \n\t"
      "pinsrh_3   %[g_vec0],       %[g_vec0],      %[r_vec1]    \n\t"
      "pextrh     %[r_vec1],       %[g_vec1],      %[one]       \n\t"
      "punpckhwd  %[g_vec1],       %[g_vec1],      %[g_vec1]    \n\t"
      "psllw      %[g_vec1],       %[g_vec1],      %[rmove1]    \n\t"
      "or         %[g_vec1],       %[g_vec1],      %[r_vec1]    \n\t"
      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])          \n\t"
      "gsswlc1    %[g_vec1],       0x0b(%[rgbbuf_ptr])          \n\t"
      "gsswrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])          \n\t"

      // Advance: 4 Y, 2 U, 2 V consumed; 12 RGB24 bytes produced.
      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[u_ptr],        %[u_ptr],       0x02         \n\t"
      "daddiu     %[v_ptr],        %[v_ptr],       0x02         \n\t"
      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],  0x0c         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec0]"=&f"(b_vec[0]),   [b_vec1]"=&f"(b_vec[1]),
        [g_vec0]"=&f"(g_vec[0]),   [g_vec1]"=&f"(g_vec[1]),
        [r_vec0]"=&f"(r_vec[0]),   [r_vec1]"=&f"(r_vec[1]),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [u_ptr]"r"(src_u),
        [v_ptr]"r"(src_v),         [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask]"f"(mask),
        [lmove1]"f"(0x18),         [rmove1]"f"(0x8),
        [one]"f"(0x1)
      : "memory"
  );
}

// Convert 4 I422 pixels per iteration to 8 bytes of ARGB4444 (4 bits per
// channel) using Loongson MMI. Assumes width is a positive multiple of 4.
void I422ToARGB4444Row_MMI(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      // Coefficient setup — identical layout to the other I422 rows above.
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask]      \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask]      \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[u_ptr])               \n\t"
      "gslwrc1    %[u],            0x00(%[u_ptr])               \n\t"
      "gslwlc1    %[v],            0x03(%[v_ptr])               \n\t"
      "gslwrc1    %[v],            0x00(%[v_ptr])               \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"  // y*0x0101
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"  // y1

      // u3|u2|u1|u0 --> u1|u1|u0|u0
      "punpcklbh  %[u],            %[u],           %[u]         \n\t"  // u
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      // v3|v2|v1|v0 --> v1|v1|v0|v0
      "punpcklbh  %[v],            %[v],           %[v]         \n\t"
      "punpcklbh  %[v],            %[v],           %[zero]      \n\t"
      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      // Build 8888 ARGB in two registers, then reduce each byte to its
      // top nibble replicated (x >> 4 | x) and pack down to 4444.
      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklwd  %[g_vec],        %[g_vec],       %[alpha]     \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "and        %[g_vec],        %[g_vec],       %[mask1]     \n\t"
      "psrlw      %[g_vec],        %[g_vec],       %[four]      \n\t"
      "psrlw      %[r_vec],        %[g_vec],       %[four]      \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"
      "punpcklbh  %[r_vec],        %[alpha],       %[zero]      \n\t"
      "and        %[g_vec],        %[g_vec],       %[r_vec]     \n\t"

      "and        %[b_vec],        %[b_vec],       %[mask1]     \n\t"
      "psrlw      %[b_vec],        %[b_vec],       %[four]      \n\t"
      "psrlw      %[r_vec],        %[b_vec],       %[four]      \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpcklbh  %[r_vec],        %[alpha],       %[zero]      \n\t"
      "and        %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[b_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[dst_argb4444])        \n\t"
      "gssdrc1    %[g_vec],        0x00(%[dst_argb4444])        \n\t"

      // Advance: 4 Y, 2 U, 2 V consumed; 8 ARGB4444 bytes produced.
      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[u_ptr],        %[u_ptr],       0x02         \n\t"
      "daddiu     %[v_ptr],        %[v_ptr],       0x02         \n\t"
      "daddiu     %[dst_argb4444], %[dst_argb4444], 0x08        \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [u_ptr]"r"(src_u),
        [v_ptr]"r"(src_v),         [dst_argb4444]"r"(dst_argb4444),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask]"f"(0xff00ff00ff00ff00),
        [four]"f"(0x4),            [mask1]"f"(0xf0f0f0f0f0f0f0f0),
        [alpha]"f"(-1)
      : "memory"
  );
}

// Convert 4 I422 pixels per iteration to 8 bytes of ARGB1555 (5 bits per
// color channel, alpha bit forced to 1 via mask3). Assumes width is a
// positive multiple of 4.
void I422ToARGB1555Row_MMI(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[u_ptr])               \n\t"
      "gslwrc1    %[u],            0x00(%[u_ptr])               \n\t"
      "gslwlc1    %[v],            0x03(%[v_ptr])               \n\t"
      "gslwrc1    %[v],            0x00(%[v_ptr])               \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      // u3|u2|u1|u0 --> u1|u1|u0|u0
      "punpcklbh  %[u],            %[u],           %[u]         \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      // v3|v2|v1|v0 --> v1|v1|v0|v0
      "punpcklbh  %[v],            %[v],           %[v]         \n\t"
      "punpcklbh  %[v],            %[v],           %[zero]      \n\t"
      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      // Per 32-bit pixel: take top 5 bits of B,G,R (>>3, mask2 = 0x1f per
      // word), shift into 5|5|5 position, set the alpha bit (mask3).
      "psrlw      %[temp],         %[g_vec],       %[three]     \n\t"
      "and        %[g_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"
      "psrlw      %[temp],         %[temp],        %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"
      "or         %[g_vec],        %[g_vec],       %[mask3]     \n\t"

      "psrlw      %[temp],         %[b_vec],       %[three]     \n\t"
      "and        %[b_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "psrlw      %[temp],         %[temp],        %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "or         %[b_vec],        %[b_vec],       %[mask3]     \n\t"

      // Gather the four 16-bit results into one 64-bit store.
      "punpcklhw  %[r_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[r_vec],       %[b_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[dst_argb1555])        \n\t"
      "gssdrc1    %[g_vec],        0x00(%[dst_argb1555])        \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[u_ptr],        %[u_ptr],       0x02         \n\t"
      "daddiu     %[v_ptr],        %[v_ptr],       0x02         \n\t"
      "daddiu     %[dst_argb1555], %[dst_argb1555], 0x08        \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [u_ptr]"r"(src_u),
        [v_ptr]"r"(src_v),         [dst_argb1555]"r"(dst_argb1555),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [three]"f"(0x3),           [mask2]"f"(0x1f0000001f),
        [eight]"f"(0x8),           [mask3]"f"(0x800000008000),
        [lmove5]"f"(0x5)
      : "memory"
  );
}

// Convert 4 I422 pixels per iteration to 8 bytes of RGB565 (5-6-5 bits).
// Assumes width is a positive multiple of 4.
void I422ToRGB565Row_MMI(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[u_ptr])               \n\t"
      "gslwrc1    %[u],            0x00(%[u_ptr])               \n\t"
      "gslwlc1    %[v],            0x03(%[v_ptr])               \n\t"
      "gslwrc1    %[v],            0x00(%[v_ptr])               \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      // u3|u2|u1|u0 --> u1|u1|u0|u0
      "punpcklbh  %[u],            %[u],           %[u]         \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      // v3|v2|v1|v0 --> v1|v1|v0|v0
      "punpcklbh  %[v],            %[v],           %[v]         \n\t"
      "punpcklbh  %[v],            %[v],           %[zero]      \n\t"
      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      // 565 pack. Shift amounts that are not available as immediates are
      // synthesized with paddb (e.g. 3+6=9, 3+8=11); green keeps 6 bits via
      // the mask1>>8 byte mask.
      "psrlh      %[temp],         %[g_vec],       %[three]     \n\t"
      "and        %[g_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[seven]     \n\t"
      "psrlw      %[r_vec],        %[mask1],       %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[r_vec]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"
      "paddb      %[r_vec],        %[three],       %[six]       \n\t"
      "psrlw      %[temp],         %[temp],        %[r_vec]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "paddb      %[temp],         %[three],       %[eight]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"

      "psrlh      %[temp],         %[b_vec],       %[three]     \n\t"
      "and        %[b_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[seven]     \n\t"
      "psrlw      %[r_vec],        %[mask1],       %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[r_vec]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[lmove5]    \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "paddb      %[r_vec],        %[three],       %[six]       \n\t"
      "psrlw      %[temp],         %[temp],        %[r_vec]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "paddb      %[temp],         %[three],       %[eight]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "punpcklhw  %[r_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[r_vec],       %[b_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[u_ptr],        %[u_ptr],       0x02         \n\t"
      "daddiu     %[v_ptr],        %[v_ptr],       0x02         \n\t"
      "daddiu     %[dst_rgb565],   %[dst_rgb565],  0x08         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [u_ptr]"r"(src_u),
        [v_ptr]"r"(src_v),         [dst_rgb565]"r"(dst_rgb565),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [three]"f"(0x3),           [mask2]"f"(0x1f0000001f),
        [eight]"f"(0x8),           [seven]"f"(0x7),
        [lmove5]"f"(0x5)
      : "memory"
  );
}

// Convert 4 NV12 pixels (4 Y bytes + 2 interleaved UV pairs) per iteration
// to 16 bytes of ARGB. U/V are split from the interleaved plane with pshufh
// (ushu=0xA0 selects U lanes, vshu=0xf5 selects V lanes). Assumes width is a
// positive multiple of 4.
void NV12ToARGBRow_MMI(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[uv_ptr])              \n\t"
      "gslwrc1    %[u],            0x00(%[uv_ptr])              \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "pshufh     %[v],            %[u],           %[vshu]      \n\t"
      "pshufh     %[u],            %[u],           %[ushu]      \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      // Interleave B,G,R with alpha=0xff and store 4 ARGB pixels.
      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklwd  %[g_vec],        %[g_vec],       %[alpha]     \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])          \n\t"
      "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[uv_ptr],       %[uv_ptr],      0x04         \n\t"
      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],  0x10         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [uv_ptr]"r"(src_uv),
        [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [ushu]"f"(0xA0),           [vshu]"f"(0xf5),
        [alpha]"f"(-1)
      : "memory"
  );
}

// NV21 variant of NV12ToARGBRow_MMI: the chroma plane is V,U interleaved, so
// the two pshufh selectors are swapped relative to NV12. Assumes width is a
// positive multiple of 4.
void NV21ToARGBRow_MMI(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[vu_ptr])              \n\t"
      "gslwrc1    %[u],            0x00(%[vu_ptr])              \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      // VU order: ushu picks the V lanes, vshu picks the U lanes here.
      "pshufh     %[v],            %[u],           %[ushu]      \n\t"
      "pshufh     %[u],            %[u],           %[vshu]      \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklwd  %[g_vec],        %[g_vec],       %[alpha]     \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])          \n\t"
      "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[vu_ptr],       %[vu_ptr],      0x04         \n\t"
      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],  0x10         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [vu_ptr]"r"(src_vu),
        [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [ushu]"f"(0xA0),           [vshu]"f"(0xf5),
        [alpha]"f"(-1)
      : "memory"
  );
}

// Convert 4 NV12 pixels per iteration to 12 bytes of packed RGB24.
// Assumes width is a positive multiple of 4.
void NV12ToRGB24Row_MMI(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[uv_ptr])              \n\t"
      "gslwrc1    %[u],            0x00(%[uv_ptr])              \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "pshufh     %[v],            %[u],           %[vshu]      \n\t"
      "pshufh     %[u],            %[u],           %[ushu]      \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      // Squeeze the 4-byte pixels into a 12-byte RGB24 run
      // (8-byte + 4-byte stores).
      "punpckhwd  %[r_vec],        %[g_vec],       %[g_vec]     \n\t"
      "psllw      %[temp],         %[r_vec],       %[lmove1]    \n\t"
      "or         %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrlw      %[temp],         %[r_vec],       %[rmove1]    \n\t"
      "pextrh     %[temp],         %[temp],        %[zero]      \n\t"
      "pinsrh_2   %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pextrh     %[temp],         %[b_vec],       %[zero]      \n\t"
      "pinsrh_3   %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pextrh     %[temp],         %[b_vec],       %[one]       \n\t"
      "punpckhwd  %[b_vec],        %[b_vec],       %[b_vec]     \n\t"
      "psllw      %[b_vec],        %[b_vec],       %[rmove1]    \n\t"
      "or         %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])          \n\t"
      "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])          \n\t"
      "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[uv_ptr],       %[uv_ptr],      0x04         \n\t"
      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],  0x0C         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [uv_ptr]"r"(src_uv),
        [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [ushu]"f"(0xA0),           [vshu]"f"(0xf5),
        [alpha]"f"(-1),            [lmove1]"f"(0x18),
        [one]"f"(0x1),             [rmove1]"f"(0x8)
      : "memory"
  );
}

// NV21 variant of NV12ToRGB24Row_MMI (chroma plane is V,U interleaved, so
// the pshufh selectors are swapped). Assumes width is a positive multiple
// of 4.
void NV21ToRGB24Row_MMI(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[vu_ptr])              \n\t"
      "gslwrc1    %[u],            0x00(%[vu_ptr])              \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      // VU order: selectors swapped relative to NV12.
      "pshufh     %[v],            %[u],           %[ushu]      \n\t"
      "pshufh     %[u],            %[u],           %[vshu]      \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "punpckhwd  %[r_vec],        %[g_vec],       %[g_vec]     \n\t"
      "psllw      %[temp],         %[r_vec],       %[lmove1]    \n\t"
      "or         %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrlw      %[temp],         %[r_vec],       %[rmove1]    \n\t"
      "pextrh     %[temp],         %[temp],        %[zero]      \n\t"
      "pinsrh_2   %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pextrh     %[temp],         %[b_vec],       %[zero]      \n\t"
      "pinsrh_3   %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pextrh     %[temp],         %[b_vec],       %[one]       \n\t"
      "punpckhwd  %[b_vec],        %[b_vec],       %[b_vec]     \n\t"
      "psllw      %[b_vec],        %[b_vec],       %[rmove1]    \n\t"
      "or         %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])          \n\t"
      "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])          \n\t"
      "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[vu_ptr],       %[vu_ptr],      0x04         \n\t"
      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],  0x0C         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [vu_ptr]"r"(src_vu),
        [rgbbuf_ptr]"r"(rgb_buf),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [ushu]"f"(0xA0),           [vshu]"f"(0xf5),
        [lmove1]"f"(0x18),         [rmove1]"f"(0x8),
        [one]"f"(0x1)
      : "memory"
  );
}

// Convert 4 NV12 pixels per iteration to 8 bytes of RGB565. The 5-bit shift
// count is synthesized as 8-3 with psubb (clobbers %[y] as a scratch).
// Assumes width is a positive multiple of 4.
void NV12ToRGB565Row_MMI(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[ug],           %[ug],          %[zero]      \n\t"
      "pshufh     %[ug],           %[ug],          %[zero]      \n\t"
      "ldc1       %[vg],           0x20(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vg],           %[vg],          %[zero]      \n\t"
      "pshufh     %[vg],           %[vg],          %[five]      \n\t"
      "ldc1       %[br],           0xa0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[vr],           0x40(%[yuvcons_ptr])         \n\t"
      "punpcklbh  %[vr],           %[vr],          %[zero]      \n\t"
      "pshufh     %[vr],           %[vr],          %[five]      \n\t"
      "or         %[vr],           %[vr],          %[mask1]     \n\t"

      "1:                                                       \n\t"
      "gslwlc1    %[y],            0x03(%[y_ptr])               \n\t"
      "gslwrc1    %[y],            0x00(%[y_ptr])               \n\t"
      "gslwlc1    %[u],            0x03(%[uv_ptr])              \n\t"
      "gslwrc1    %[u],            0x00(%[uv_ptr])              \n\t"
      "punpcklbh  %[u],            %[u],           %[zero]      \n\t"
      "pshufh     %[v],            %[u],           %[vshu]      \n\t"
      "pshufh     %[u],            %[u],           %[ushu]      \n\t"

      "punpcklbh  %[y],            %[y],           %[y]         \n\t"
      "pmulhuh    %[y],            %[y],           %[yg]        \n\t"

      "paddsh     %[b_vec],        %[y],           %[bb]        \n\t"
      "pmullh     %[temp],         %[u],           %[ub]        \n\t"
      "psubsh     %[b_vec],        %[b_vec],       %[temp]      \n\t"
      "psrah      %[b_vec],        %[b_vec],       %[six]       \n\t"

      "paddsh     %[g_vec],        %[y],           %[bg]        \n\t"
      "pmullh     %[temp],         %[u],           %[ug]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "pmullh     %[temp],         %[v],           %[vg]        \n\t"
      "psubsh     %[g_vec],        %[g_vec],       %[temp]      \n\t"
      "psrah      %[g_vec],        %[g_vec],       %[six]       \n\t"

      "paddsh     %[r_vec],        %[y],           %[br]        \n\t"
      "pmullh     %[temp],         %[v],           %[vr]        \n\t"
      "psubsh     %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "psrah      %[r_vec],        %[r_vec],       %[six]       \n\t"

      "packushb   %[r_vec],        %[b_vec],       %[r_vec]     \n\t"
      "packushb   %[g_vec],        %[g_vec],       %[zero]      \n\t"
      "punpcklbh  %[b_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpckhbh  %[r_vec],        %[r_vec],       %[g_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[b_vec],       %[r_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "psrlh      %[temp],         %[g_vec],       %[three]     \n\t"
      "and        %[g_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[seven]     \n\t"
      "psrlw      %[r_vec],        %[mask1],       %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[r_vec]     \n\t"
      "psubb      %[y],            %[eight],       %[three]     \n\t"  // 5
      "psllw      %[r_vec],        %[r_vec],       %[y]         \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"
      "paddb      %[r_vec],        %[three],       %[six]       \n\t"
      "psrlw      %[temp],         %[temp],        %[r_vec]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "paddb      %[temp],         %[three],       %[eight]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "or         %[g_vec],        %[g_vec],       %[r_vec]     \n\t"

      "psrlh      %[temp],         %[b_vec],       %[three]     \n\t"
      "and        %[b_vec],        %[temp],        %[mask2]     \n\t"
      "psrlw      %[temp],         %[temp],        %[seven]     \n\t"
      "psrlw      %[r_vec],        %[mask1],       %[eight]     \n\t"
      "and        %[r_vec],        %[temp],        %[r_vec]     \n\t"
      "psubb      %[y],            %[eight],       %[three]     \n\t"  // 5
      "psllw      %[r_vec],        %[r_vec],       %[y]         \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"
      "paddb      %[r_vec],        %[three],       %[six]       \n\t"
      "psrlw      %[temp],         %[temp],        %[r_vec]     \n\t"
      "and        %[r_vec],        %[temp],        %[mask2]     \n\t"
      "paddb      %[temp],         %[three],       %[eight]     \n\t"
      "psllw      %[r_vec],        %[r_vec],       %[temp]      \n\t"
      "or         %[b_vec],        %[b_vec],       %[r_vec]     \n\t"

      "punpcklhw  %[r_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpckhhw  %[b_vec],        %[g_vec],       %[b_vec]     \n\t"
      "punpcklhw  %[g_vec],        %[r_vec],       %[b_vec]     \n\t"

      "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])          \n\t"
      "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])          \n\t"

      "daddiu     %[y_ptr],        %[y_ptr],       0x04         \n\t"
      "daddiu     %[uv_ptr],       %[uv_ptr],      0x04         \n\t"
      "daddiu     %[dst_rgb565],   %[dst_rgb565],  0x08         \n\t"
      "daddi      %[width],        %[width],       -0x04        \n\t"
      "bnez       %[width],        1b                           \n\t"

      : [y]"=&f"(y),               [u]"=&f"(u),
        [v]"=&f"(v),
        [b_vec]"=&f"(b_vec),       [g_vec]"=&f"(g_vec),
        [r_vec]"=&f"(r_vec),       [temp]"=&f"(temp),
        [ub]"=&f"(ub),             [ug]"=&f"(ug),
        [vg]"=&f"(vg),             [vr]"=&f"(vr),
        [bb]"=&f"(bb),             [bg]"=&f"(bg),
        [br]"=&f"(br),             [yg]"=&f"(yg)
      : [y_ptr]"r"(src_y),         [uv_ptr]"r"(src_uv),
        [dst_rgb565]"r"(dst_rgb565),
        [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
        [zero]"f"(0x00),           [five]"f"(0x55),
        [six]"f"(0x6),             [mask1]"f"(0xff00ff00ff00ff00),
        [ushu]"f"(0xA0),           [vshu]"f"(0xf5),
        [three]"f"(0x3),           [mask2]"f"(0x1f0000001f),
        [eight]"f"(0x8),           [seven]"f"(0x7)
      : "memory"
  );
}

// Convert 4 YUY2 (Y0 U Y1 V) pixels per iteration to 16 bytes of ARGB.
void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
                       uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint64_t y, u, v;
  uint64_t b_vec, g_vec, r_vec, temp;
  uint64_t ub,ug,vg,vr,bb,bg,br,yg;

  __asm__ volatile(
      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])         \n\t"
      "ldc1       %[bb],           0x60(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ub],           0x00(%[yuvcons_ptr])         \n\t"
      "or         %[ub],           %[ub],          %[mask1]     \n\t"
      "ldc1       %[bg],           0x80(%[yuvcons_ptr])         \n\t"
      "ldc1       %[ug],           0x20(%[yuvcons_ptr])         \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], 
%[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 
%[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] 
\n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc deleted file mode 100644 index 1226ef3e..00000000 --- a/files/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include <assert.h> -#include <string.h> - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - 
- __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], %[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) 
\n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void 
ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 
0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), 
[src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and 
%[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - 
"psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - 
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr + src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - 
- const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], 
%[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : 
"memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 
%[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - 
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], 
%[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. 
-void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], 
%[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] 
"r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - 
: "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py deleted file mode 100755 index 977c86de..00000000 --- a/files/tools_libyuv/autoroller/roll_deps.py +++ /dev/null @@ -1,509 +0,0 @@ -#!/usr/bin/env vpython3 - -# Copyright 2017 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This is a modified copy of the script in -# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py -# customized for libyuv. - -"""Script to automatically roll dependencies in the libyuv DEPS file.""" - -import argparse -import base64 -import collections -import logging -import os -import re -import subprocess -import sys -import urllib.request - - -# Skip these dependencies (list without solution name prefix). 
-DONT_AUTOROLL_THESE = [ - 'src/third_party/gflags/src', -] - -LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' -CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src' -CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s' -CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' -CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' - -COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') -CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$') -ROLL_BRANCH_NAME = 'roll_chromium_revision' - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir, - os.pardir)) -CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) - -sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) -import find_depot_tools # pylint: disable=wrong-import-position -find_depot_tools.add_depot_tools_to_path() - -CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' -CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', - 'clang', 'scripts', 'update.py') - -DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') -ChangedDep = collections.namedtuple('ChangedDep', - 'path url current_rev new_rev') - -class RollError(Exception): - pass - - -def VarLookup(local_scope): - return lambda var_name: local_scope['vars'][var_name] - - -def ParseDepsDict(deps_content): - local_scope = {} - global_scope = { - 'Var': VarLookup(local_scope), - 'Str': lambda s: s, - 'deps_os': {}, - } - exec(deps_content, global_scope, local_scope) - return local_scope - - -def ParseLocalDepsFile(filename): - with open(filename, 'rb') as f: - deps_content = f.read().decode('utf-8') - return ParseDepsDict(deps_content) - - -def ParseRemoteCrDepsFile(revision): - deps_content = ReadRemoteCrFile('DEPS', revision) - return ParseDepsDict(deps_content) - - -def ParseCommitPosition(commit_message): - for line in 
reversed(commit_message.splitlines()): - m = COMMIT_POSITION_RE.match(line.strip()) - if m: - return int(m.group(1)) - logging.error('Failed to parse commit position id from:\n%s\n', - commit_message) - sys.exit(-1) - - -def _RunCommand(command, working_dir=None, ignore_exit_code=False, - extra_env=None, input_data=None): - """Runs a command and returns the output from that command. - - If the command fails (exit code != 0), the function will exit the process. - - Returns: - A tuple containing the stdout and stderr outputs as strings. - """ - working_dir = working_dir or CHECKOUT_SRC_DIR - logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) - env = os.environ.copy() - if extra_env: - assert all(isinstance(value, str) for value in extra_env.values()) - logging.debug('extra env: %s', extra_env) - env.update(extra_env) - p = subprocess.Popen(command, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env, - cwd=working_dir, - universal_newlines=True) - std_output, err_output = p.communicate(input_data) - p.stdout.close() - p.stderr.close() - if not ignore_exit_code and p.returncode != 0: - logging.error('Command failed: %s\n' - 'stdout:\n%s\n' - 'stderr:\n%s\n', ' '.join(command), std_output, err_output) - sys.exit(p.returncode) - return std_output, err_output - - -def _GetBranches(): - """Returns a tuple of active,branches. - - The 'active' is the name of the currently active branch and 'branches' is a - list of all branches. - """ - lines = _RunCommand(['git', 'branch'])[0].split('\n') - branches = [] - active = '' - for line in lines: - if '*' in line: - # The assumption is that the first char will always be the '*'. - active = line[1:].strip() - branches.append(active) - else: - branch = line.strip() - if branch: - branches.append(branch) - return active, branches - - -def _ReadGitilesContent(url): - # Download and decode BASE64 content until - # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed. 
- base64_content = ReadUrlContent(url + '?format=TEXT') - return base64.b64decode(base64_content[0]).decode('utf-8') - - -def ReadRemoteCrFile(path_below_src, revision): - """Reads a remote Chromium file of a specific revision. Returns a string.""" - return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision, - path_below_src)) - - -def ReadRemoteCrCommit(revision): - """Reads a remote Chromium commit message. Returns a string.""" - return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision) - - -def ReadUrlContent(url): - """Connect to a remote host and read the contents. Returns a list of lines.""" - conn = urllib.request.urlopen(url) - try: - return conn.readlines() - except IOError as e: - logging.exception('Error connecting to %s. Error: %s', url, e) - raise - finally: - conn.close() - - -def GetMatchingDepsEntries(depsentry_dict, dir_path): - """Gets all deps entries matching the provided path. - - This list may contain more than one DepsEntry object. - Example: dir_path='src/testing' would give results containing both - 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS. - Example 2: dir_path='src/build' should return 'src/build' but not - 'src/buildtools'. - - Returns: - A list of DepsEntry objects. 
- """ - result = [] - for path, depsentry in depsentry_dict.items(): - if path == dir_path: - result.append(depsentry) - else: - parts = path.split('/') - if all(part == parts[i] - for i, part in enumerate(dir_path.split('/'))): - result.append(depsentry) - return result - -def BuildDepsentryDict(deps_dict): - """Builds a dict of paths to DepsEntry objects from a raw deps dict.""" - result = {} - - def AddDepsEntries(deps_subdict): - for path, deps_url_spec in deps_subdict.items(): - if isinstance(deps_url_spec, dict): - if deps_url_spec.get('dep_type') == 'cipd': - continue - deps_url = deps_url_spec['url'] - else: - deps_url = deps_url_spec - if not path in result: - url, revision = deps_url.split('@') if deps_url else (None, None) - result[path] = DepsEntry(path, url, revision) - - AddDepsEntries(deps_dict['deps']) - for deps_os in ['win', 'mac', 'linux', 'android', 'ios', 'unix']: - AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) - return result - - -def CalculateChangedDeps(libyuv_deps, new_cr_deps): - """ - Calculate changed deps entries based on entries defined in the libyuv DEPS - file: - - If a shared dependency with the Chromium DEPS file: roll it to the same - revision as Chromium (i.e. entry in the new_cr_deps dict) - - If it's a Chromium sub-directory, roll it to the HEAD revision (notice - this means it may be ahead of the chromium_revision, but generally these - should be close). - - If it's another DEPS entry (not shared with Chromium), roll it to HEAD - unless it's configured to be skipped. - - Returns: - A list of ChangedDep objects representing the changed deps. - """ - result = [] - libyuv_entries = BuildDepsentryDict(libyuv_deps) - new_cr_entries = BuildDepsentryDict(new_cr_deps) - for path, libyuv_deps_entry in libyuv_entries.items(): - if path in DONT_AUTOROLL_THESE: - continue - cr_deps_entry = new_cr_entries.get(path) - if cr_deps_entry: - # Use the revision from Chromium's DEPS file. 
- new_rev = cr_deps_entry.revision - assert libyuv_deps_entry.url == cr_deps_entry.url, ( - 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' % - (path, libyuv_deps_entry.url, cr_deps_entry.url)) - else: - # Use the HEAD of the deps repo. - stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url, - 'HEAD']) - new_rev = stdout.strip().split('\t')[0] - - # Check if an update is necessary. - if libyuv_deps_entry.revision != new_rev: - logging.debug('Roll dependency %s to %s', path, new_rev) - result.append(ChangedDep(path, libyuv_deps_entry.url, - libyuv_deps_entry.revision, new_rev)) - return sorted(result) - - -def CalculateChangedClang(new_cr_rev): - def GetClangRev(lines): - for line in lines: - match = CLANG_REVISION_RE.match(line) - if match: - return match.group(1) - raise RollError('Could not parse Clang revision from:\n' + '\n'.join(' ' + l for l in lines)) - - with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f: - current_lines = f.readlines() - current_rev = GetClangRev(current_lines) - - new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, - new_cr_rev).splitlines() - new_rev = GetClangRev(new_clang_update_py) - return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) - - -def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, - new_commit_pos, changed_deps_list, clang_change): - current_cr_rev = current_cr_rev[0:10] - new_cr_rev = new_cr_rev[0:10] - rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) - git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) - - commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval, - git_number_interval)] - commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval)) - commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % - rev_interval)) - if changed_deps_list: - commit_msg.append('Changed dependencies:') - - for c in changed_deps_list: - commit_msg.append('* %s: %s/+log/%s..%s' % 
(c.path, c.url, - c.current_rev[0:10], - c.new_rev[0:10])) - change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') - commit_msg.append('DEPS diff: %s\n' % change_url) - else: - commit_msg.append('No dependencies changed.') - - if clang_change.current_rev != clang_change.new_rev: - commit_msg.append('Clang version changed %s:%s' % - (clang_change.current_rev, clang_change.new_rev)) - change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, - CLANG_UPDATE_SCRIPT_URL_PATH) - commit_msg.append('Details: %s\n' % change_url) - else: - commit_msg.append('No update to Clang.\n') - - # TBR needs to be non-empty for Gerrit to process it. - git_author = _RunCommand(['git', 'config', 'user.email'], - working_dir=CHECKOUT_SRC_DIR)[0].strip() - commit_msg.append('TBR=%s' % git_author) - - commit_msg.append('BUG=None') - return '\n'.join(commit_msg) - - -def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision, - changed_deps): - """Update the DEPS file with the new revision.""" - - # Update the chromium_revision variable. - with open(deps_filename, 'rb') as deps_file: - deps_content = deps_file.read().decode('utf-8') - deps_content = deps_content.replace(old_cr_revision, new_cr_revision) - with open(deps_filename, 'wb') as deps_file: - deps_file.write(deps_content.encode('utf-8')) - - # Update each individual DEPS entry. - for dep in changed_deps: - local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) - if not os.path.isdir(local_dep_dir): - raise RollError( - 'Cannot find local directory %s. Make sure the .gclient file\n' - 'contains all platforms in the target_os list, i.e.\n' - 'target_os = ["android", "unix", "mac", "ios", "win"];\n' - 'Then run "gclient sync" again.' 
% local_dep_dir) - _RunCommand( - ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)], - working_dir=CHECKOUT_SRC_DIR) - - -def _IsTreeClean(): - stdout, _ = _RunCommand(['git', 'status', '--porcelain']) - if len(stdout) == 0: - return True - - logging.error('Dirty/unversioned files:\n%s', stdout) - return False - - -def _EnsureUpdatedMasterBranch(dry_run): - current_branch = _RunCommand( - ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0] - if current_branch != 'main': - logging.error('Please checkout the main branch and re-run this script.') - if not dry_run: - sys.exit(-1) - - logging.info('Updating main branch...') - _RunCommand(['git', 'pull']) - - -def _CreateRollBranch(dry_run): - logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME) - if not dry_run: - _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME]) - - -def _RemovePreviousRollBranch(dry_run): - active_branch, branches = _GetBranches() - if active_branch == ROLL_BRANCH_NAME: - active_branch = 'main' - if ROLL_BRANCH_NAME in branches: - logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME) - if not dry_run: - _RunCommand(['git', 'checkout', active_branch]) - _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME]) - - -def _LocalCommit(commit_msg, dry_run): - logging.info('Committing changes locally.') - if not dry_run: - _RunCommand(['git', 'add', '--update', '.']) - _RunCommand(['git', 'commit', '-m', commit_msg]) - - -def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos): - if skip_cq: - return 0 - if (new_commit_pos - current_commit_pos) < cq_over: - return 1 - return 2 - - -def _UploadCL(commit_queue_mode): - """Upload the committed changes as a changelist to Gerrit. - - commit_queue_mode: - - 2: Submit to commit queue. - - 1: Run trybots but do not submit to CQ. - - 0: Skip CQ, upload only. 
- """ - cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail'] - if commit_queue_mode >= 2: - logging.info('Sending the CL to the CQ...') - cmd.extend(['-o', 'label=Bot-Commit+1']) - cmd.extend(['-o', 'label=Commit-Queue+2']) - elif commit_queue_mode >= 1: - logging.info('Starting CQ dry run...') - cmd.extend(['-o', 'label=Commit-Queue+1']) - extra_env = { - 'EDITOR': 'true', - 'SKIP_GCE_AUTH_FOR_GIT': '1', - } - stdout, stderr = _RunCommand(cmd, extra_env=extra_env) - logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s', - stdout, stderr) - - -def main(): - p = argparse.ArgumentParser() - p.add_argument('--clean', action='store_true', default=False, - help='Removes any previous local roll branch.') - p.add_argument('-r', '--revision', - help=('Chromium Git revision to roll to. Defaults to the ' - 'Chromium HEAD revision if omitted.')) - p.add_argument('--dry-run', action='store_true', default=False, - help=('Calculate changes and modify DEPS, but don\'t create ' - 'any local branch, commit, upload CL or send any ' - 'tryjobs.')) - p.add_argument('-i', '--ignore-unclean-workdir', action='store_true', - default=False, - help=('Ignore if the current branch is not main or if there ' - 'are uncommitted changes (default: %(default)s).')) - grp = p.add_mutually_exclusive_group() - grp.add_argument('--skip-cq', action='store_true', default=False, - help='Skip sending the CL to the CQ (default: %(default)s)') - grp.add_argument('--cq-over', type=int, default=1, - help=('Commit queue dry run if the revision difference ' - 'is below this number (default: %(default)s)')) - p.add_argument('-v', '--verbose', action='store_true', default=False, - help='Be extra verbose in printing of log messages.') - opts = p.parse_args() - - if opts.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - if not opts.ignore_unclean_workdir and not _IsTreeClean(): - logging.error('Please clean your local 
checkout first.') - return 1 - - if opts.clean: - _RemovePreviousRollBranch(opts.dry_run) - - if not opts.ignore_unclean_workdir: - _EnsureUpdatedMasterBranch(opts.dry_run) - - new_cr_rev = opts.revision - if not new_cr_rev: - stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) - head_rev = stdout.strip().split('\t')[0] - logging.info('No revision specified. Using HEAD: %s', head_rev) - new_cr_rev = head_rev - - deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') - libyuv_deps = ParseLocalDepsFile(deps_filename) - current_cr_rev = libyuv_deps['vars']['chromium_revision'] - - current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev)) - new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev)) - - new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev) - changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) - clang_change = CalculateChangedClang(new_cr_rev) - commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev, - current_commit_pos, new_commit_pos, - changed_deps, clang_change) - logging.debug('Commit message:\n%s', commit_msg) - - _CreateRollBranch(opts.dry_run) - UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps) - _LocalCommit(commit_msg, opts.dry_run) - commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, - current_commit_pos, new_commit_pos) - logging.info('Uploading CL...') - if not opts.dry_run: - _UploadCL(commit_queue_mode) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/files/include/libyuv.h b/include/libyuv.h index a06e1233..a06e1233 100644 --- a/files/include/libyuv.h +++ b/include/libyuv.h diff --git a/files/include/libyuv/basic_types.h b/include/libyuv/basic_types.h index 1bea67f2..1bea67f2 100644 --- a/files/include/libyuv/basic_types.h +++ b/include/libyuv/basic_types.h diff --git a/files/include/libyuv/compare.h b/include/libyuv/compare.h index 3353ad71..3353ad71 100644 --- a/files/include/libyuv/compare.h +++ 
b/include/libyuv/compare.h diff --git a/files/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index d8e82d72..8293c919 100644 --- a/files/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -28,7 +28,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif diff --git a/files/include/libyuv/convert.h b/include/libyuv/convert.h index 46d37159..88619a4f 100644 --- a/files/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -151,6 +151,33 @@ int MM21ToI420(const uint8_t* src_y, int width, int height); +// Convert MM21 to YUY2 +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert MT2T to P010 +// Note that src_y and src_uv point to packed 10-bit values, so the Y plane will +// be 10 / 8 times the dimensions of the image. Also for this reason, +// src_stride_y and src_stride_uv are given in bytes. +LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I422 to NV21. 
LIBYUV_API int I422ToNV21(const uint8_t* src_y, @@ -272,6 +299,23 @@ int I210ToI422(const uint16_t* src_y, int width, int height); +#define H410ToH420 I410ToI420 +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H410ToH444 I410ToI444 LIBYUV_API int I410ToI444(const uint16_t* src_y, @@ -323,6 +367,23 @@ int I212ToI422(const uint16_t* src_y, int width, int height); +#define H212ToH420 I212ToI420 +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H412ToH444 I412ToI444 LIBYUV_API int I412ToI444(const uint16_t* src_y, @@ -340,6 +401,23 @@ int I412ToI444(const uint16_t* src_y, int width, int height); +#define H412ToH420 I412ToI420 +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define I412ToI012 I410ToI010 #define H410ToH010 I410ToI010 #define H412ToH012 I410ToI010 @@ -560,6 +638,36 @@ int NV16ToNV24(const uint8_t* src_y, int width, int height); +// Convert P010 to I010. +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert P012 to I012. 
+LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert P010 to P410. LIBYUV_API int P010ToP410(const uint16_t* src_y, @@ -677,6 +785,21 @@ int ARGBToI420(const uint8_t* src_argb, int width, int height); +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + // BGRA little endian (argb in memory) to I420. LIBYUV_API int BGRAToI420(const uint8_t* src_bgra, diff --git a/files/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index f66d20ce..35eeac9b 100644 --- a/files/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ @@ -404,6 +406,32 @@ int U444ToABGR(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24. +LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I444 to RAW. 
+LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + // Convert I010 to ARGB. LIBYUV_API int I010ToARGB(const uint16_t* src_y, @@ -1312,6 +1340,32 @@ int J420ToRAW(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + LIBYUV_API int I420ToRGB565(const uint8_t* src_y, int src_stride_y, @@ -1495,6 +1549,20 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, @@ -1893,6 +1961,20 @@ int I420ToRGB24Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24 with matrix. +LIBYUV_API +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to RGB565 with specified color matrix. 
LIBYUV_API int I420ToRGB565Matrix(const uint8_t* src_y, @@ -1907,6 +1989,20 @@ int I420ToRGB565Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB565 with specified color matrix. +LIBYUV_API +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to AR30 with matrix. LIBYUV_API int I420ToAR30Matrix(const uint8_t* src_y, @@ -1961,6 +2057,36 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, int height, enum FilterMode filter); +// Convert I422 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + +// Convert I420 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + // Convert I010 to AR30 with matrix and UV filter mode. 
LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, diff --git a/files/include/libyuv/convert_from.h b/include/libyuv/convert_from.h index 32f42a63..32f42a63 100644 --- a/files/include/libyuv/convert_from.h +++ b/include/libyuv/convert_from.h diff --git a/files/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 2a488838..ff2a581a 100644 --- a/files/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -209,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -222,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -238,6 +238,41 @@ int ARGBToJ400(const uint8_t* src_argb, int width, int height); +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J422. +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J400. (JPeg full range). +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + // Convert RGBA to J400. (JPeg full range). 
LIBYUV_API int RGBAToJ400(const uint8_t* src_rgba, diff --git a/files/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index fb90c6c7..5a81e7c9 100644 --- a/files/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -31,30 +31,36 @@ static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; // unused at this time. +static const int kCpuHasSSE42 = 0x100; static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; static const int kCpuHasF16C = 0x2000; -static const int kCpuHasGFNI = 0x4000; -static const int kCpuHasAVX512BW = 0x8000; -static const int kCpuHasAVX512VL = 0x10000; -static const int kCpuHasAVX512VNNI = 0x20000; -static const int kCpuHasAVX512VBMI = 0x40000; -static const int kCpuHasAVX512VBMI2 = 0x80000; -static const int kCpuHasAVX512VBITALG = 0x100000; -static const int kCpuHasAVX512VPOPCNTDQ = 0x200000; +static const int kCpuHasAVX512BW = 0x4000; +static const int kCpuHasAVX512VL = 0x8000; +static const int kCpuHasAVX512VNNI = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX10 = 0x100000; +static const int kCpuHasAVXVNNI = 0x200000; +static const int kCpuHasAVXVNNIINT8 = 0x400000; // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x400000; -static const int kCpuHasMSA = 0x800000; +static const int kCpuHasMIPS = 0x800000; +static const int kCpuHasMSA = 0x1000000; // These flags are only valid on LOONGARCH processors. static const int kCpuHasLOONGARCH = 0x2000000; static const int kCpuHasLSX = 0x4000000; static const int kCpuHasLASX = 0x8000000; +// These flags are only valid on RISCV processors. 
+static const int kCpuHasRISCV = 0x10000000; +static const int kCpuHasRVV = 0x20000000; +static const int kCpuHasRVVZVFH = 0x40000000; + // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API @@ -78,6 +84,8 @@ LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); LIBYUV_API int MipsCpuCaps(const char* cpuinfo_name); +LIBYUV_API +int RiscvCpuCaps(const char* cpuinfo_name); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. diff --git a/files/include/libyuv/loongson_intrinsics.h b/include/libyuv/loongson_intrinsics.h index 1d613def..1d613def 100644 --- a/files/include/libyuv/loongson_intrinsics.h +++ b/include/libyuv/loongson_intrinsics.h diff --git a/files/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index b9a44fcc..b9a44fcc 100644 --- a/files/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h diff --git a/files/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h index 275f8d4c..275f8d4c 100644 --- a/files/include/libyuv/mjpeg_decoder.h +++ b/include/libyuv/mjpeg_decoder.h diff --git a/files/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 1ef2256b..f9344721 100644 --- a/files/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -30,7 +30,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -85,13 +88,23 @@ void SetPlane(uint8_t* dst_y, // Convert a plane of tiles of 16 x H to linear. 
LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height); +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); + +// Convert a plane of 16 bit tiles of 16 x H to linear. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); // Convert a UV plane of tiles of 16 x H into linear U and V planes. LIBYUV_API @@ -105,6 +118,18 @@ void DetileSplitUVPlane(const uint8_t* src_uv, int height, int tile_height); +// Convert a Y and UV plane of tiles into interlaced YUY2. +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, @@ -370,7 +395,26 @@ int I210Copy(const uint16_t* src_y, int width, int height); +// Copy I410 to I410. +#define I410ToI410 I410Copy +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -383,6 +427,7 @@ int NV12Copy(const uint8_t* src_y, int height); // Copy NV21. Supports inverting. 
+LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, @@ -785,15 +830,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, int width, int height); -typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); - -// Get function to Alpha Blend ARGB pixels and store to destination. -LIBYUV_API -ARGBBlendRow GetARGBBlend(); - // Alpha Blend ARGB images and store to destination. // Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. diff --git a/files/include/libyuv/rotate.h b/include/libyuv/rotate.h index 684ed5e6..37460c4a 100644 --- a/files/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -85,6 +85,60 @@ int I444Rotate(const uint8_t* src_y, int height, enum RotationMode mode); +// Rotate I010 frame. +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I210 frame. +LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I410 frame. +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + // Rotate NV12 input and store in I420. 
LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, @@ -156,6 +210,16 @@ void RotatePlane270(const uint8_t* src, int width, int height); +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); + // Rotations for when U and V are interleaved. // These functions take one UV input pointer and // split the data into two buffers while diff --git a/files/include/libyuv/rotate_argb.h b/include/libyuv/rotate_argb.h index 20432949..20432949 100644 --- a/files/include/libyuv/rotate_argb.h +++ b/include/libyuv/rotate_argb.h diff --git a/files/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index aa8528a9..3e6a2fef 100644 --- a/files/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -28,7 +28,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -42,6 +45,8 @@ extern "C" { // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSE4X4_32_SSE2 +#define HAS_TRANSPOSE4X4_32_AVX2 #endif // The following are available for 64 bit GCC: @@ -54,6 +59,7 @@ extern "C" { (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON +#define HAS_TRANSPOSE4X4_32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -215,6 +221,48 @@ void TransposeUVWx16_Any_LSX(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); +void TransposeWxH_16_C(const uint16_t* src, + int 
src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); +void TransposeWx1_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); #ifdef __cplusplus } // extern "C" diff --git a/files/include/libyuv/row.h b/include/libyuv/row.h index 1a1cf4b6..46685a50 100644 --- a/files/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -11,7 +11,8 @@ #ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ -#include <stdlib.h> // For malloc. +#include <stddef.h> // For NULL +#include <stdlib.h> // For malloc #include "libyuv/basic_types.h" @@ -30,7 +31,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -75,9 +79,6 @@ extern "C" { (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions: #define HAS_ABGRTOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_SSSE3 -#endif #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -92,12 +93,6 @@ extern "C" { #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#endif #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 @@ -111,6 +106,7 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_I444TORGB24ROW_SSSE3 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_J400TOARGBROW_SSE2 #define HAS_J422TOARGBROW_SSSE3 @@ -124,16 +120,13 @@ extern "C" { #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 +#define HAS_RAWTOYJROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 -#endif #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -145,13 +138,18 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVROW_SSSE3 +#define 
HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_RGBATOUVROW_SSSE3 +#endif // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_SSSE3 -#endif #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -166,7 +164,6 @@ extern "C" { #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_BLENDPLANEROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -201,17 +198,10 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#endif #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -219,6 +209,7 @@ extern "C" { #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 +#define HAS_I444TORGB24ROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 @@ -228,6 +219,8 @@ extern "C" { #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -237,15 +230,16 @@ extern "C" { #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test half float cast +#if 
!defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBTOUVJROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#endif // Effects: #define HAS_ARGBADDROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_AVX2 -#endif #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ @@ -282,28 +276,33 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ABGRTOYJROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 +#define HAS_DETILEROW_16_SSE2 #define HAS_DETILEROW_SSE2 #define HAS_DETILESPLITUVROW_SSSE3 +#define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 #define HAS_I212TOAR30ROW_SSSE3 #define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 @@ -312,15 +311,17 @@ extern "C" { #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_SSSE3 -#endif #define HAS_SPLITARGBROW_SSE2 #define 
HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITXRGBROW_SSE2 #define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 +#define HAS_YUY2TONVUVROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_SSSE3 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -335,31 +336,23 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_AB64TOARGBROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_AVX2 +#define HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 -#endif +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_INTERPOLATEROW_16TO8_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_DETILEROW_16_AVX #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -367,23 +360,35 @@ extern "C" { #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 -#define HAS_P210TOAR30ROW_AVX2 -#define HAS_P210TOARGBROW_AVX2 -#define HAS_P410TOAR30ROW_AVX2 -#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define 
HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 +#define HAS_INTERPOLATEROW_16TO8_AVX2 +#define HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 +#define HAS_MERGEARGBROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_MIRRORUVROW_AVX2 #define HAS_MULTIPLYROW_16_AVX2 -#if !defined(LIBYUV_BIT_EXACT) +#define HAS_NV21TOYUV24ROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_RGBATOYJROW_AVX2 -#endif #define HAS_SPLITARGBROW_AVX2 -#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SWAPUVROW_AVX2 +#define HAS_YUY2TONVUVROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_AVX2 +#define HAS_ABGRTOUVROW_AVX2 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -397,8 +402,9 @@ extern "C" { // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. 
Issue libyuv:789 #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) + (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) #define HAS_ARGBTORGB24ROW_AVX512VBMI +#define HAS_MERGEUVROW_AVX512BW #endif // The following are available for AVX512 clang x64 platforms: @@ -412,7 +418,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_AB64TOARGBROW_NEON +#define HAS_ABGRTOUVJROW_NEON #define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYJROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_AR64TOARGBROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -444,8 +452,11 @@ extern "C" { #define HAS_BYTETOFLOATROW_NEON #define HAS_CONVERT16TO8ROW_NEON #define HAS_COPYROW_NEON +#define HAS_DETILEROW_16_NEON #define HAS_DETILEROW_NEON #define HAS_DETILESPLITUVROW_NEON +#define HAS_DETILETOYUY2_NEON +#define HAS_UNPACKMT2T_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON @@ -461,6 +472,7 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444ALPHATOARGBROW_NEON #define HAS_I444TOARGBROW_NEON +#define HAS_I444TORGB24ROW_NEON #define HAS_INTERPOLATEROW_16_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_J400TOARGBROW_NEON @@ -513,6 +525,7 @@ extern "C" { #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TONVUVROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON @@ -524,13 +537,13 @@ extern "C" { #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_RGB24MIRRORROW_NEON #define HAS_SOBELROW_NEON #define 
HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXROW_NEON @@ -540,12 +553,13 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_GAUSSCOL_F32_NEON +#define HAS_GAUSSROW_F32_NEON #define HAS_INTERPOLATEROW_16TO8_NEON #define HAS_SCALESUMSAMPLES_NEON -#define HAS_GAUSSROW_F32_NEON -#define HAS_GAUSSCOL_F32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVJROW_MSA #define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOYROW_MSA #define HAS_ARGB1555TOARGBROW_MSA @@ -581,27 +595,25 @@ extern "C" { #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_I422TOARGBROW_MSA -#define HAS_I422TORGBAROW_MSA #define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGB1555ROW_MSA +#define HAS_I422TOARGB4444ROW_MSA +#define HAS_I422TOARGBROW_MSA #define HAS_I422TORGB24ROW_MSA #define HAS_I422TORGB565ROW_MSA -#define HAS_I422TOARGB4444ROW_MSA -#define HAS_I422TOARGB1555ROW_MSA -#define HAS_NV12TOARGBROW_MSA -#define HAS_NV12TORGB565ROW_MSA -#define HAS_NV21TOARGBROW_MSA -#define HAS_YUY2TOARGBROW_MSA -#define HAS_UYVYTOARGBROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA #define HAS_I444TOARGBROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA #define HAS_MIRRORROW_MSA -#define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA @@ -621,113 +633,226 @@ extern "C" { #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA #define 
HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA #endif #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ABGRTOUVROW_LSX +#define HAS_ABGRTOYROW_LSX #define HAS_ARGB1555TOARGBROW_LSX -#define HAS_RGB565TOARGBROW_LSX -#define HAS_RGB24TOARGBROW_LSX -#define HAS_RAWTOARGBROW_LSX -#define HAS_ARGB1555TOYROW_LSX #define HAS_ARGB1555TOUVROW_LSX -#define HAS_RGB565TOYROW_LSX -#define HAS_RGB565TOUVROW_LSX -#define HAS_RGB24TOYROW_LSX -#define HAS_RGB24TOUVROW_LSX -#define HAS_RAWTOYROW_LSX -#define HAS_RAWTOUVROW_LSX +#define HAS_ARGB1555TOYROW_LSX +#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ARGBADDROW_LSX +#define HAS_ARGBATTENUATEROW_LSX +#define HAS_ARGBBLENDROW_LSX +#define HAS_ARGBCOLORMATRIXROW_LSX +#define HAS_ARGBEXTRACTALPHAROW_LSX +#define HAS_ARGBGRAYROW_LSX +#define HAS_ARGBSEPIAROW_LSX +#define HAS_ARGBSHADEROW_LSX +#define HAS_ARGBSHUFFLEROW_LSX +#define HAS_ARGBSUBTRACTROW_LSX +#define HAS_ARGBQUANTIZEROW_LSX +#define HAS_ARGBSETROW_LSX +#define HAS_ARGBTOARGB1555ROW_LSX +#define HAS_ARGBTOARGB4444ROW_LSX +#define HAS_ARGBTORAWROW_LSX +#define HAS_ARGBTORGB24ROW_LSX +#define HAS_ARGBTORGB565ROW_LSX +#define HAS_ARGBTORGB565DITHERROW_LSX +#define HAS_ARGBTOUVJROW_LSX +#define HAS_ARGBTOUV444ROW_LSX +#define HAS_ARGBTOUVROW_LSX +#define HAS_ARGBTOYJROW_LSX +#define HAS_ARGBMIRRORROW_LSX +#define HAS_ARGBMULTIPLYROW_LSX +#define HAS_BGRATOUVROW_LSX +#define HAS_BGRATOYROW_LSX +#define HAS_I400TOARGBROW_LSX +#define HAS_I444TOARGBROW_LSX +#define HAS_INTERPOLATEROW_LSX +#define HAS_I422ALPHATOARGBROW_LSX +#define HAS_I422TOARGB1555ROW_LSX +#define HAS_I422TOARGB4444ROW_LSX +#define HAS_I422TORGB24ROW_LSX +#define HAS_I422TORGB565ROW_LSX +#define HAS_I422TORGBAROW_LSX +#define HAS_I422TOUYVYROW_LSX +#define HAS_I422TOYUY2ROW_LSX +#define HAS_J400TOARGBROW_LSX +#define HAS_MERGEUVROW_LSX +#define HAS_MIRRORROW_LSX +#define HAS_MIRRORUVROW_LSX +#define 
HAS_MIRRORSPLITUVROW_LSX #define HAS_NV12TOARGBROW_LSX #define HAS_NV12TORGB565ROW_LSX #define HAS_NV21TOARGBROW_LSX +#define HAS_RAWTOARGBROW_LSX +#define HAS_RAWTORGB24ROW_LSX +#define HAS_RAWTOUVROW_LSX +#define HAS_RAWTOYROW_LSX +#define HAS_RGB24TOARGBROW_LSX +#define HAS_RGB24TOUVROW_LSX +#define HAS_RGB24TOYROW_LSX +#define HAS_RGB565TOARGBROW_LSX +#define HAS_RGB565TOUVROW_LSX +#define HAS_RGB565TOYROW_LSX +#define HAS_RGBATOUVROW_LSX +#define HAS_RGBATOYROW_LSX +#define HAS_SETROW_LSX #define HAS_SOBELROW_LSX #define HAS_SOBELTOPLANEROW_LSX #define HAS_SOBELXYROW_LSX -#define HAS_ARGBTOYJROW_LSX -#define HAS_BGRATOYROW_LSX -#define HAS_BGRATOUVROW_LSX -#define HAS_ABGRTOYROW_LSX -#define HAS_ABGRTOUVROW_LSX -#define HAS_RGBATOYROW_LSX -#define HAS_RGBATOUVROW_LSX -#define HAS_ARGBTOUVJROW_LSX -#define HAS_I444TOARGBROW_LSX -#define HAS_I400TOARGBROW_LSX -#define HAS_J400TOARGBROW_LSX -#define HAS_YUY2TOARGBROW_LSX -#define HAS_UYVYTOARGBROW_LSX -#define HAS_INTERPOLATEROW_LSX -#define HAS_ARGBSETROW_LSX -#define HAS_RAWTORGB24ROW_LSX -#define HAS_MERGEUVROW_LSX -#define HAS_ARGBEXTRACTALPHAROW_LSX -#define HAS_ARGBBLENDROW_LSX -#define HAS_ARGBQUANTIZEROW_LSX -#define HAS_ARGBCOLORMATRIXROW_LSX #define HAS_SPLITUVROW_LSX -#define HAS_SETROW_LSX -#define HAS_MIRRORSPLITUVROW_LSX +#define HAS_UYVYTOARGBROW_LSX +#define HAS_UYVYTOUV422ROW_LSX +#define HAS_UYVYTOUVROW_LSX +#define HAS_UYVYTOYROW_LSX +#define HAS_YUY2TOARGBROW_LSX +#define HAS_YUY2TOUVROW_LSX +#define HAS_YUY2TOUV422ROW_LSX +#define HAS_YUY2TOYROW_LSX +#define HAS_ARGBTOYROW_LSX +#define HAS_ABGRTOYJROW_LSX +#define HAS_RGBATOYJROW_LSX +#define HAS_RGB24TOYJROW_LSX +#define HAS_RAWTOYJROW_LSX +#endif + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#define HAS_I422TOARGBROW_LSX #endif #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) +#define HAS_ARGB1555TOARGBROW_LASX +#define HAS_ARGB1555TOUVROW_LASX +#define HAS_ARGB1555TOYROW_LASX +#define 
HAS_ARGB4444TOARGBROW_LASX +#define HAS_ARGBADDROW_LASX +#define HAS_ARGBATTENUATEROW_LASX +#define HAS_ARGBGRAYROW_LASX +#define HAS_ARGBMIRRORROW_LASX +#define HAS_ARGBMULTIPLYROW_LASX +#define HAS_ARGBSEPIAROW_LASX +#define HAS_ARGBSHADEROW_LASX +#define HAS_ARGBSHUFFLEROW_LASX +#define HAS_ARGBSUBTRACTROW_LASX +#define HAS_ARGBTOARGB1555ROW_LASX +#define HAS_ARGBTOARGB4444ROW_LASX +#define HAS_ARGBTORAWROW_LASX +#define HAS_ARGBTORGB24ROW_LASX +#define HAS_ARGBTORGB565DITHERROW_LASX +#define HAS_ARGBTORGB565ROW_LASX +#define HAS_ARGBTOUV444ROW_LASX +#define HAS_ARGBTOUVJROW_LASX +#define HAS_ARGBTOUVROW_LASX +#define HAS_ARGBTOYJROW_LASX +#define HAS_ARGBTOYROW_LASX +#define HAS_ABGRTOYJROW_LASX +#define HAS_ABGRTOYROW_LASX +#define HAS_I422ALPHATOARGBROW_LASX +#define HAS_I422TOARGB1555ROW_LASX +#define HAS_I422TOARGB4444ROW_LASX #define HAS_I422TOARGBROW_LASX +#define HAS_I422TORGB24ROW_LASX +#define HAS_I422TORGB565ROW_LASX #define HAS_I422TORGBAROW_LASX -#define HAS_I422ALPHATOARGBROW_LASX -#define HAS_I422TOYUY2ROW_LASX #define HAS_I422TOUYVYROW_LASX +#define HAS_I422TOYUY2ROW_LASX #define HAS_MIRRORROW_LASX #define HAS_MIRRORUVROW_LASX -#define HAS_ARGBMIRRORROW_LASX -#define HAS_I422TORGB24ROW_LASX -#define HAS_I422TORGB565ROW_LASX -#define HAS_I422TOARGB4444ROW_LASX -#define HAS_I422TOARGB1555ROW_LASX -#define HAS_YUY2TOUVROW_LASX -#define HAS_YUY2TOYROW_LASX -#define HAS_YUY2TOUV422ROW_LASX -#define HAS_UYVYTOYROW_LASX -#define HAS_UYVYTOUVROW_LASX -#define HAS_UYVYTOUV422ROW_LASX -#define HAS_ARGBTOYROW_LASX -#define HAS_ARGBTOUVROW_LASX -#define HAS_ARGBTORGB24ROW_LASX -#define HAS_ARGBTORAWROW_LASX -#define HAS_ARGBTORGB565ROW_LASX -#define HAS_ARGBTOARGB1555ROW_LASX -#define HAS_ARGBTOARGB4444ROW_LASX -#define HAS_ARGBTOUV444ROW_LASX -#define HAS_ARGBMULTIPLYROW_LASX -#define HAS_ARGBADDROW_LASX -#define HAS_ARGBSUBTRACTROW_LASX -#define HAS_ARGBATTENUATEROW_LASX -#define HAS_ARGBTORGB565DITHERROW_LASX -#define HAS_ARGBSHUFFLEROW_LASX -#define 
HAS_ARGBSHADEROW_LASX -#define HAS_ARGBGRAYROW_LASX -#define HAS_ARGBSEPIAROW_LASX -#define HAS_ARGB4444TOARGBROW_LASX -#define HAS_ARGB1555TOARGBROW_LASX -#define HAS_RGB565TOARGBROW_LASX -#define HAS_RGB24TOARGBROW_LASX -#define HAS_RAWTOARGBROW_LASX -#define HAS_ARGB1555TOYROW_LASX -#define HAS_ARGB1555TOUVROW_LASX -#define HAS_RGB565TOYROW_LASX -#define HAS_RGB565TOUVROW_LASX -#define HAS_RGB24TOYROW_LASX -#define HAS_RGB24TOUVROW_LASX -#define HAS_RAWTOYROW_LASX -#define HAS_RAWTOUVROW_LASX #define HAS_NV12TOARGBROW_LASX #define HAS_NV12TORGB565ROW_LASX #define HAS_NV21TOARGBROW_LASX -#define HAS_ARGBTOYJROW_LASX -#define HAS_ARGBTOUVJROW_LASX +#define HAS_RAWTOARGBROW_LASX +#define HAS_RAWTOUVROW_LASX +#define HAS_RAWTOYROW_LASX +#define HAS_RGB24TOARGBROW_LASX +#define HAS_RGB24TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX +#define HAS_RGB565TOARGBROW_LASX +#define HAS_RGB565TOUVROW_LASX +#define HAS_RGB565TOYROW_LASX +#define HAS_UYVYTOUV422ROW_LASX +#define HAS_UYVYTOUVROW_LASX +#define HAS_UYVYTOYROW_LASX +#define HAS_YUY2TOUV422ROW_LASX +#define HAS_YUY2TOUVROW_LASX +#define HAS_YUY2TOYROW_LASX +#define HAS_RGBATOYROW_LASX +#define HAS_RGBATOYJROW_LASX +#define HAS_BGRATOYROW_LASX +#define HAS_RGB24TOYJROW_LASX +#define HAS_RAWTOYJROW_LASX +#endif + +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_COPYROW_RVV +#if __riscv_v_intrinsic == 11000 +#define HAS_AB64TOARGBROW_RVV +#define HAS_ABGRTOYJROW_RVV +#define HAS_ABGRTOYROW_RVV +#define HAS_AR64TOARGBROW_RVV +#define HAS_AR64TOAB64ROW_RVV +#define HAS_ARGBATTENUATEROW_RVV +#define HAS_ARGBBLENDROW_RVV +#define HAS_ARGBCOPYYTOALPHAROW_RVV +#define HAS_ARGBEXTRACTALPHAROW_RVV +#define HAS_ARGBTOAB64ROW_RVV +#define HAS_ARGBTOABGRROW_RVV +#define HAS_ARGBTOAR64ROW_RVV +#define HAS_ARGBTOBGRAROW_RVV +#define HAS_ARGBTORAWROW_RVV +#define HAS_ARGBTORGB24ROW_RVV +#define HAS_ARGBTORGBAROW_RVV +#define HAS_ARGBTOYJROW_RVV +#define HAS_ARGBTOYMATRIXROW_RVV +#define HAS_ARGBTOYROW_RVV 
+#define HAS_BGRATOYROW_RVV +#define HAS_BLENDPLANEROW_RVV +#define HAS_I400TOARGBROW_RVV +#define HAS_I422ALPHATOARGBROW_RVV +#define HAS_I422TOARGBROW_RVV +#define HAS_I422TORGB24ROW_RVV +#define HAS_I422TORGBAROW_RVV +#define HAS_I444ALPHATOARGBROW_RVV +#define HAS_I444TOARGBROW_RVV +#define HAS_I444TORGB24ROW_RVV +#define HAS_INTERPOLATEROW_RVV +#define HAS_J400TOARGBROW_RVV +#define HAS_MERGEARGBROW_RVV +#define HAS_MERGERGBROW_RVV +#define HAS_MERGEUVROW_RVV +#define HAS_MERGEXRGBROW_RVV +#define HAS_NV12TOARGBROW_RVV +#define HAS_NV12TORGB24ROW_RVV +#define HAS_NV21TOARGBROW_RVV +#define HAS_NV21TORGB24ROW_RVV +#define HAS_RAWTOARGBROW_RVV +#define HAS_RAWTORGB24ROW_RVV +#define HAS_RAWTORGBAROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV +#define HAS_RGB24TOARGBROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV +#define HAS_RGBATOARGBROW_RVV +#define HAS_RGBATOYJROW_RVV +#define HAS_RGBATOYMATRIXROW_RVV +#define HAS_RGBATOYROW_RVV +#define HAS_RGBTOYMATRIXROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITUVROW_RVV +#define HAS_SPLITXRGBROW_RVV +#endif #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -789,8 +914,8 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +// This struct is for ARM and RISC-V color conversion. 
struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; @@ -816,13 +941,13 @@ struct YuvConstants { #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) -#define align_buffer_64(var, size) \ - uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#define align_buffer_64(var, size) \ + void* var##_mem = malloc((size) + 63); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ #define free_aligned_buffer_64(var) \ free(var##_mem); \ - var = 0 + var = NULL #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP @@ -894,6 +1019,12 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -981,6 +1112,50 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + 
const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1000,6 +1175,12 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1012,6 +1193,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1025,6 +1212,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1038,6 +1232,12 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_LSX(const 
uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1050,6 +1250,12 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1062,6 +1268,12 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1074,6 +1286,12 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1142,15 +1360,39 @@ void UYVYToARGBRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* 
src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1164,13 +1406,23 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, 
uint8_t* dst_yj, int width); +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1189,11 +1441,20 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_LSX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_LASX(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_LASX(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1203,6 +1464,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_NEON(const 
uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1258,6 +1524,11 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1372,6 +1643,13 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); @@ -1384,6 +1662,8 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); @@ -1393,9 +1673,15 @@ void RGB24ToYRow_LSX(const 
uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -1409,6 +1695,7 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1423,6 +1710,7 @@ void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void 
ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1453,10 +1741,15 @@ void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, void BGRAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1465,7 +1758,14 @@ void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1485,6 +1785,11 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1495,6 +1800,11 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1525,6 +1835,11 @@ void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1535,6 +1850,11 @@ void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1568,11 +1888,20 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void 
ARGBToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1582,6 +1911,11 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1747,16 +2081,16 @@ void ARGBToUVJRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1772,6 +2106,11 @@ void RGBAToUVRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1826,6 +2165,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void 
MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1833,17 +2173,20 @@ void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, @@ -1867,10 +2210,13 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_v, int width); +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width); + void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void 
ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, @@ -1883,6 +2229,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1925,6 +2272,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1949,7 +2300,6 @@ void DetileRow_C(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); - void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, @@ -1966,6 +2316,42 @@ void DetileRow_Any_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_16_C(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void 
DetileRow_16_Any_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -1991,6 +2377,38 @@ void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size); +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2003,6 +2421,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2015,6 +2437,10 @@ void MergeUVRow_LSX(const uint8_t* src_u, const uint8_t* src_v, uint8_t* 
dst_uv, int width); +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2023,6 +2449,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void MergeUVRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2079,6 +2509,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2105,6 +2540,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2139,6 +2579,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width); void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2187,6 +2633,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width); +void SplitARGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width); void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2231,6 +2683,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); 
+void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width); void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2271,6 +2728,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitXRGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2604,8 +3066,8 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_NEON(const uint16_t* src_y, - uint8_t* dst_y, +void Convert16To8Row_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, int scale, int width); @@ -2614,6 +3076,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2647,6 +3110,9 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width); void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2666,6 +3132,7 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr, void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int 
width); +void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2713,6 +3180,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void ARGBShuffleRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -2733,6 +3204,10 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -2765,14 +3240,18 @@ void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const 
uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -2932,15 +3411,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2968,7 +3447,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2981,23 +3460,44 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* 
dst_rgb, + int width); void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width); +void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width); +void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width); + +void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width); +void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width); void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3011,6 +3511,8 @@ void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width); +void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width); void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, @@ -3035,6 +3537,12 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void 
ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width); +void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width); void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); @@ -3077,6 +3585,7 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3096,6 +3605,12 @@ void I444ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -3290,6 +3805,18 @@ void I444ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); 
void I422ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3631,12 +4158,24 @@ void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3823,13 +4362,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -3976,6 +4515,10 @@ void I400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, @@ -4014,6 +4557,10 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void 
ARGBBlendRow_RVV(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -4040,6 +4587,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -4084,10 +4636,18 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4130,10 +4690,18 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBAddRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBAddRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBAddRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4177,10 +4745,18 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBSubtractRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void 
ARGBSubtractRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4273,21 +4849,37 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); +void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); - +void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4298,6 +4890,12 @@ void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4443,6 +5041,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void 
I422ToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4455,6 +5059,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4468,6 +5078,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4481,6 +5098,12 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4493,6 +5116,12 @@ void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4505,6 +5134,12 @@ void 
I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4517,6 +5152,12 @@ void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4592,6 +5233,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4602,6 +5247,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4612,17 +5261,27 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int 
src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, @@ -4632,6 +5291,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4642,6 +5305,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4652,6 +5319,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4662,6 +5333,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4672,17 +5347,27 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, 
uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4692,6 +5377,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4737,12 +5426,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width); void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_LASX(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, @@ -4752,6 +5447,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, @@ -4798,12 +5497,18 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int 
src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4813,6 +5518,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4927,6 +5636,11 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToYUY2Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4937,6 +5651,11 @@ void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToUYVYRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4947,6 +5666,11 @@ void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4957,6 +5681,11 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4977,9 +5706,15 @@ void 
ARGBAttenuateRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4992,6 +5727,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -5018,12 +5756,14 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width); void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, @@ -5103,6 +5843,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); void ARGBShadeRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -5175,6 +5919,11 @@ void 
InterpolateRow_LSX(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -5482,7 +6231,19 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, float* dst_ptr, float param, int width); - +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width); +// Convert a column of FP16 Half Floats to a row of FP32 Floats +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width); +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width); void ARGBLumaColorTableRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -5526,6 +6287,17 @@ void GaussCol_F32_C(const float* src0, float* dst, int width); +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width); +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width); + +void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/include/libyuv/scale.h b/include/libyuv/scale.h index 443f89c2..bfe4a344 100644 --- a/files/include/libyuv/scale.h +++ b/include/libyuv/scale.h @@ -27,39 +27,40 @@ typedef enum FilterMode { } FilterModeEnum; // Scale a YUV plane. +// Returns 0 if successful. 
LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); +int ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); +int ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); // Sample is expected to be in the low 12 bits. LIBYUV_API -void ScalePlane_12(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); +int ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); // Scales a YUV 4:2:0 image from the src width and height to the // dst width and height. diff --git a/files/include/libyuv/scale_argb.h b/include/libyuv/scale_argb.h index 7641f18e..7641f18e 100644 --- a/files/include/libyuv/scale_argb.h +++ b/include/libyuv/scale_argb.h diff --git a/files/include/libyuv/scale_rgb.h b/include/libyuv/scale_rgb.h index d17c39fd..d17c39fd 100644 --- a/files/include/libyuv/scale_rgb.h +++ b/include/libyuv/scale_rgb.h diff --git a/files/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 6cb5e128..02ed61ca 100644 --- a/files/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -29,7 +29,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -133,6 +136,8 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEUVROWDOWN2_NEON +#define HAS_SCALEUVROWDOWN2LINEAR_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2_LINEAR_NEON @@ -173,6 +178,38 @@ extern "C" { #define HAS_SCALEROWDOWN34_LSX #endif +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_SCALEADDROW_RVV +// TODO: Test ScaleARGBRowDownEven_RVV and enable it +// #define HAS_SCALEARGBROWDOWNEVEN_RVV +#define HAS_SCALEUVROWDOWN4_RVV +#define HAS_SCALEUVROWDOWNEVEN_RVV +#if __riscv_v_intrinsic == 11000 +#define HAS_SCALEARGBROWDOWN2_RVV +#define HAS_SCALEARGBROWDOWN2BOX_RVV +#define HAS_SCALEARGBROWDOWN2LINEAR_RVV +#define HAS_SCALEARGBROWDOWNEVENBOX_RVV +#define HAS_SCALEROWDOWN2_RVV +#define HAS_SCALEROWDOWN2BOX_RVV +#define HAS_SCALEROWDOWN2LINEAR_RVV +#define HAS_SCALEROWDOWN34_0_BOX_RVV +#define HAS_SCALEROWDOWN34_1_BOX_RVV +#define HAS_SCALEROWDOWN34_RVV +#define HAS_SCALEROWDOWN38_2_BOX_RVV +#define HAS_SCALEROWDOWN38_3_BOX_RVV +#define HAS_SCALEROWDOWN38_RVV +#define HAS_SCALEROWDOWN4_RVV +#define HAS_SCALEROWDOWN4BOX_RVV +#define HAS_SCALEROWUP2_BILINEAR_RVV +#define HAS_SCALEROWUP2_LINEAR_RVV +#define HAS_SCALEUVROWDOWN2_RVV +#define HAS_SCALEUVROWDOWN2BOX_RVV +#define HAS_SCALEUVROWDOWN2LINEAR_RVV +#define HAS_SCALEUVROWUP2_BILINEAR_RVV +#define HAS_SCALEUVROWUP2_LINEAR_RVV +#endif +#endif + // Scale ARGB vertically with bilinear interpolation. 
void ScalePlaneVertical(int src_height, int dst_width, @@ -214,6 +251,17 @@ void ScalePlaneVertical_16To8(int src_height, int scale, enum FilterMode filtering); +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering); + // Simplify the filtering based on scale factors. enum FilterMode ScaleFilterReduce(int src_width, int src_height, @@ -259,6 +307,16 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -267,6 +325,16 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -279,6 +347,16 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -906,6 +984,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t 
src_stride, uint8_t* dst, int dst_width); +void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -1018,6 +1108,16 @@ void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width); +void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1100,6 +1200,18 @@ void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); +void ScaleUVRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1160,6 +1272,16 @@ void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, int src_stepx, uint8_t* dst_uv, int dst_width); +void ScaleUVRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); void ScaleUVRowDownEven_MSA(const uint8_t* 
src_ptr, ptrdiff_t src_stride, int32_t src_stepx, @@ -1249,6 +1371,14 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1701,6 +1831,61 @@ void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); + +void ScaleRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowUp2_Linear_RVV(const 
uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/include/libyuv/scale_uv.h b/include/libyuv/scale_uv.h index 8e74e319..8e74e319 100644 --- a/files/include/libyuv/scale_uv.h +++ b/include/libyuv/scale_uv.h diff --git a/files/include/libyuv/version.h b/include/libyuv/version.h index a85be048..a9c54400 100644 --- a/files/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1837 +#define LIBYUV_VERSION 1883 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/files/include/libyuv/video_common.h b/include/libyuv/video_common.h index 32b8a521..32b8a521 100644 --- a/files/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h diff --git a/infra/config/OWNERS b/infra/config/OWNERS new file mode 100644 index 00000000..2c4f90a0 --- /dev/null +++ b/infra/config/OWNERS @@ -0,0 +1,3 @@ +fbarchard@chromium.org +mbonadei@chromium.org +jansson@google.com diff --git a/files/infra/config/PRESUBMIT.py b/infra/config/PRESUBMIT.py index 01ec0eed..f79e08ad 100644 --- a/files/infra/config/PRESUBMIT.py +++ b/infra/config/PRESUBMIT.py @@ -2,6 +2,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
+USE_PYTHON3 = True + def CheckChangeOnUpload(input_api, output_api): return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api) diff --git a/files/infra/config/README.md b/infra/config/README.md index e5e3b5f8..e5e3b5f8 100644 --- a/files/infra/config/README.md +++ b/infra/config/README.md diff --git a/files/infra/config/codereview.settings b/infra/config/codereview.settings index 6d742273..6d742273 100644 --- a/files/infra/config/codereview.settings +++ b/infra/config/codereview.settings diff --git a/files/infra/config/commit-queue.cfg b/infra/config/commit-queue.cfg index 4a8d77f4..4a8d77f4 100644 --- a/files/infra/config/commit-queue.cfg +++ b/infra/config/commit-queue.cfg diff --git a/files/infra/config/cr-buildbucket.cfg b/infra/config/cr-buildbucket.cfg index 061cf33b..7415851b 100644 --- a/files/infra/config/cr-buildbucket.cfg +++ b/infra/config/cr-buildbucket.cfg @@ -29,10 +29,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -60,10 +59,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -91,10 +89,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -111,7 +108,7 @@ buckets { name: 
"Android Tester ARM32 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -120,9 +117,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -139,7 +136,7 @@ buckets { name: "Android Tester ARM32 Release (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -148,9 +145,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -167,7 +164,7 @@ buckets { name: "Android Tester ARM64 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -176,9 +173,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": 
"client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -206,10 +203,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -237,10 +233,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -268,10 +263,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -299,10 +293,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -330,10 +323,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -361,10 +353,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": 
{' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -392,10 +383,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -423,10 +413,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -454,10 +443,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -485,10 +473,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -516,10 +503,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -537,7 +523,7 @@ buckets { swarming_host: 
"chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -546,9 +532,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -566,7 +552,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -575,9 +561,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -595,7 +581,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -604,9 +590,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -634,10 +620,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": 
true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -665,10 +650,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -696,10 +680,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -727,10 +710,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -758,10 +740,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -789,10 +770,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' 
' "recipe": "libyuv/libyuv"' @@ -820,10 +800,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -851,10 +830,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -872,7 +850,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -881,9 +859,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -901,7 +879,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -910,9 +888,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": 
"client.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -985,7 +963,7 @@ buckets { name: "android" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -994,9 +972,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1013,7 +991,7 @@ buckets { name: "android_arm64" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1022,9 +1000,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1041,7 +1019,7 @@ buckets { name: "android_rel" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1050,9 +1028,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' 
"builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1080,10 +1058,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1111,10 +1088,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1132,7 +1108,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1141,9 +1117,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1161,7 +1137,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1170,9 +1146,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' 
"metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1200,10 +1176,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1231,10 +1206,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1262,10 +1236,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1293,10 +1266,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1324,10 +1296,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1355,10 +1326,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' 
"enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1386,10 +1356,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1417,10 +1386,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1438,7 +1406,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1447,9 +1415,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1467,7 +1435,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1476,9 
+1444,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1496,7 +1464,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1505,9 +1473,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1535,10 +1503,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "run_presubmit",' @@ -1568,10 +1535,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1599,10 +1565,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' 
"builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1630,10 +1595,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1661,10 +1625,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1692,10 +1655,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' @@ -1723,10 +1685,9 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' diff --git a/files/infra/config/luci-logdog.cfg b/infra/config/luci-logdog.cfg index adc75bef..adc75bef 100644 --- a/files/infra/config/luci-logdog.cfg +++ b/infra/config/luci-logdog.cfg diff --git a/files/infra/config/luci-milo.cfg b/infra/config/luci-milo.cfg index baf786f2..baf786f2 100644 --- a/files/infra/config/luci-milo.cfg +++ b/infra/config/luci-milo.cfg diff --git a/files/infra/config/luci-scheduler.cfg b/infra/config/luci-scheduler.cfg index 0ec5dd0e..0ec5dd0e 100644 --- 
a/files/infra/config/luci-scheduler.cfg +++ b/infra/config/luci-scheduler.cfg diff --git a/files/infra/config/main.star b/infra/config/main.star index b922ca02..e83afe4f 100755 --- a/files/infra/config/main.star +++ b/infra/config/main.star @@ -8,22 +8,14 @@ lucicfg.check_version("1.30.9") LIBYUV_GIT = "https://chromium.googlesource.com/libyuv/libyuv" LIBYUV_GERRIT = "https://chromium-review.googlesource.com/libyuv/libyuv" -GOMA_BACKEND_RBE_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, +RECLIENT_CI = { + "instance": "rbe-webrtc-trusted", + "metrics_project": "chromium-reclient-metrics", } -GOMA_BACKEND_RBE_ATS_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, - "enable_ats": True, -} - -# Disable ATS on Windows CQ/try. -GOMA_BACKEND_RBE_NO_ATS_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, - "enable_ats": False, +RECLIENT_CQ = { + "instance": "rbe-webrtc-untrusted", + "metrics_project": "chromium-reclient-metrics", } # Use LUCI Scheduler BBv2 names and add Scheduler realms configs. @@ -70,6 +62,10 @@ luci.project( ], bindings = [ luci.binding( + roles = "role/swarming.taskTriggerer", # for LED tasks. 
+ groups = "project-libyuv-admins", + ), + luci.binding( roles = "role/configs.validator", users = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com", ), @@ -195,28 +191,15 @@ luci.bucket( def get_os_dimensions(os): if os == "android": - return {"device_type": "bullhead"} + return {"device_type": "walleye"} if os == "ios" or os == "mac": - return {"os": "Mac-10.15", "cpu": "x86-64"} + return {"os": "Mac-12", "cpu": "x86-64"} elif os == "win": return {"os": "Windows-10", "cores": "8", "cpu": "x86-64"} elif os == "linux": return {"os": "Ubuntu-18.04", "cores": "8", "cpu": "x86-64"} return {} -def get_os_properties(os, try_builder = False): - if os == "android": - return {"$build/goma": GOMA_BACKEND_RBE_PROD} - elif os in ("ios", "mac"): - return {"$build/goma": GOMA_BACKEND_RBE_PROD} - elif os == "win" and try_builder: - return {"$build/goma": GOMA_BACKEND_RBE_NO_ATS_PROD} - elif os == "win": - return {"$build/goma": GOMA_BACKEND_RBE_ATS_PROD} - elif os == "linux": - return {"$build/goma": GOMA_BACKEND_RBE_ATS_PROD} - return {} - def libyuv_ci_builder(name, dimensions, properties, triggered_by): return luci.builder( name = name, @@ -254,7 +237,7 @@ def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyu def ci_builder(name, os, category, short_name = None): dimensions = get_os_dimensions(os) - properties = get_os_properties(os) + properties = {"$build/reclient": RECLIENT_CI} dimensions["pool"] = "luci.flex.ci" properties["builder_group"] = "client.libyuv" @@ -265,7 +248,7 @@ def ci_builder(name, os, category, short_name = None): def try_builder(name, os, experiment_percentage = None): dimensions = get_os_dimensions(os) - properties = get_os_properties(os, try_builder = True) + properties = {"$build/reclient": RECLIENT_CQ} dimensions["pool"] = "luci.flex.try" properties["builder_group"] = "tryserver.libyuv" diff --git a/files/infra/config/project.cfg b/infra/config/project.cfg index 700226ad..3c327118 100644 --- 
a/files/infra/config/project.cfg +++ b/infra/config/project.cfg @@ -7,7 +7,7 @@ name: "libyuv" access: "group:all" lucicfg { - version: "1.30.9" + version: "1.39.14" package_dir: "." config_dir: "." entry_point: "main.star" diff --git a/files/infra/config/realms.cfg b/infra/config/realms.cfg index ae04529e..16ffaac9 100644 --- a/files/infra/config/realms.cfg +++ b/infra/config/realms.cfg @@ -38,6 +38,10 @@ realms { role: "role/scheduler.reader" principals: "group:all" } + bindings { + role: "role/swarming.taskTriggerer" + principals: "group:project-libyuv-admins" + } } realms { name: "ci" diff --git a/files/libyuv.gni b/libyuv.gni index 8df40ba2..343160c3 100644 --- a/files/libyuv.gni +++ b/libyuv.gni @@ -6,13 +6,15 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. -import("//build_overrides/build.gni") import("//build/config/arm.gni") +import("//build/config/loongarch64.gni") import("//build/config/mips.gni") +import("//build_overrides/build.gni") declare_args() { libyuv_include_tests = !build_with_chromium libyuv_disable_jpeg = false + libyuv_disable_rvv = false libyuv_use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) @@ -20,4 +22,8 @@ declare_args() { (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa libyuv_use_mmi = (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi + libyuv_use_lsx = + (current_cpu == "loong64") && loongarch64_use_lsx + libyuv_use_lasx = + (current_cpu == "loong64") && loongarch64_use_lasx } diff --git a/files/libyuv.gyp b/libyuv.gyp index f73a1a4b..f73a1a4b 100644 --- a/files/libyuv.gyp +++ b/libyuv.gyp diff --git a/files/libyuv.gypi b/libyuv.gypi index 48936aa7..48936aa7 100644 --- a/files/libyuv.gypi +++ b/libyuv.gypi diff --git a/files/linux.mk b/linux.mk index b541b47c..d19a888a 100644 --- a/files/linux.mk +++ b/linux.mk @@ -33,6 +33,7 @@ LOCAL_OBJ_FILES := \ 
source/rotate_argb.o \ source/rotate_common.o \ source/rotate_gcc.o \ + source/rotate_lsx.o \ source/rotate_msa.o \ source/rotate_neon.o \ source/rotate_neon64.o \ @@ -40,19 +41,24 @@ LOCAL_OBJ_FILES := \ source/row_any.o \ source/row_common.o \ source/row_gcc.o \ + source/row_lasx.o \ + source/row_lsx.o \ source/row_msa.o \ source/row_neon.o \ source/row_neon64.o \ + source/row_rvv.o \ source/row_win.o \ source/scale.o \ source/scale_any.o \ source/scale_argb.o \ source/scale_common.o \ source/scale_gcc.o \ + source/scale_lsx.o \ source/scale_msa.o \ source/scale_neon.o \ source/scale_neon64.o \ source/scale_rgb.o \ + source/scale_rvv.o \ source/scale_uv.o \ source/scale_win.o \ source/video_common.o @@ -3,7 +3,7 @@ # Note that dependencies on NDK are not directly listed since NDK auto adds # them. -LIBYUV_INCLUDES := $(LIBYUV_PATH)/files/include +LIBYUV_INCLUDES := $(LIBYUV_PATH)/include LIBYUV_C_FLAGS := diff --git a/files/pylintrc b/pylintrc index b8bea334..b8bea334 100644 --- a/files/pylintrc +++ b/pylintrc diff --git a/riscv_script/prepare_toolchain_qemu.sh b/riscv_script/prepare_toolchain_qemu.sh new file mode 100755 index 00000000..2a901739 --- /dev/null +++ b/riscv_script/prepare_toolchain_qemu.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -ev + +# Download & build RISC-V Clang toolchain & QEMU emulator. +# RISC-V Clang is for cross compile with the RISC-V Vector ISA. +# RISC-V QEMU is used to run the test suite. +# +# Requirements: Linux host w/ working C++ compiler, git, cmake, ninja, wget, tar + +# NOTE: this script must be run from the top-level directory of the LIBYUV_SRC_DIR. 
+ +RISCV_TRIPLE="riscv64-unknown-linux-gnu" +RISCV_QEMU="qemu-riscv64" + +LIBYUV_SRC_DIR=$(pwd) +BUILD_DIR="$LIBYUV_SRC_DIR"/build-toolchain-qemu +INSTALL_QEMU="$BUILD_DIR"/riscv-qemu +INSTALL_CLANG="$BUILD_DIR"/riscv-clang + +LLVM_VERSION="16.0.0" +LLVM_NAME=llvm-project-"$LLVM_VERSION".src + +RISCV_GNU_TOOLCHAIN="$BUILD_DIR"/riscv-gnu-toolchain +RISCV_CLANG_TOOLCHAIN="$BUILD_DIR"/"$LLVM_NAME" + +QEMU_NAME="qemu-7.0.0" + +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# Download and install RISC-V GNU Toolchain (needed to build Clang) +if [ ! -d "$RISCV_GNU_TOOLCHAIN" ] +then + git clone git@github.com:riscv/riscv-gnu-toolchain.git + pushd "$RISCV_GNU_TOOLCHAIN" + git submodule update --init --recursive + ./configure --with-cmodel=medany --prefix="$INSTALL_CLANG" + ionice nice make linux -j `nproc` install + popd +fi + +# Download Clang toolchain & build cross compiler +if [ ! -d "$RISCV_CLANG_TOOLCHAIN" ] +then + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-"$LLVM_VERSION"/"$LLVM_NAME".tar.xz + tar xvJf "$LLVM_NAME".tar.xz + pushd "$RISCV_CLANG_TOOLCHAIN" + cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_CLANG" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD="RISCV" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_DEFAULT_TARGET_TRIPLE="$RISCV_TRIPLE" \ + -DLLVM_INSTALL_TOOLCHAIN_ONLY=On \ + -DDEFAULT_SYSROOT=../sysroot \ + -G "Ninja" "$RISCV_CLANG_TOOLCHAIN"/llvm + ionice nice ninja -j `nproc` + ionice nice ninja -j `nproc` install + popd + pushd "$INSTALL_CLANG"/bin + ln -sf clang "$RISCV_TRIPLE"-clang + ln -sf clang++ "$RISCV_TRIPLE"-clang++ + popd +fi + +# Download QEMU and build the riscv64 Linux usermode emulator +if [ ! 
-d "$QEMU_NAME" ] +then + wget https://download.qemu.org/"$QEMU_NAME".tar.xz + tar xvJf "$QEMU_NAME".tar.xz + pushd "$QEMU_NAME" + ./configure --target-list=riscv64-linux-user --prefix="$INSTALL_QEMU" + ionice nice make -j `nproc` install + popd +fi diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake new file mode 100644 index 00000000..e287941f --- /dev/null +++ b/riscv_script/riscv-clang.cmake @@ -0,0 +1,55 @@ +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME "Linux") +set(CMAKE_SYSTEM_PROCESSOR "riscv64") + +option(USE_RVV "Enable riscv vector or not." ON) +option(USE_AUTO_VECTORIZER "Enable riscv auto vectorizer or not." OFF) + +# Avoid to use system path for cross-compile +set(CMAKE_FIND_USE_CMAKE_SYSTEM_PATH FALSE) + +set(TOOLCHAIN_PATH "" CACHE STRING "The toolcahin path.") +if(NOT TOOLCHAIN_PATH) + set(TOOLCHAIN_PATH ${CMAKE_SOURCE_DIR}/build-toolchain-qemu/riscv-clang) +endif() + +set(TOOLCHAIN_PREFIX "riscv64-unknown-linux-gnu-" CACHE STRING "The toolcahin prefix.") + +# toolchain setting +set(CMAKE_C_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang") +set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang++") + +# CMake will just use the host-side tools for the following tools, so we setup them here. +set(CMAKE_C_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_CXX_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_C_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_CXX_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump") +set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy") + +# compile options +set(RISCV_COMPILER_FLAGS "" CACHE STRING "Compile flags") +# if user provides RISCV_COMPILER_FLAGS, appeding compile flags is avoided. 
+if(RISCV_COMPILER_FLAGS STREQUAL "") + message(STATUS "USE_RVV: ${USE_RVV}") + message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}") + if(USE_RVV) + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv") + if(NOT USE_AUTO_VECTORIZER) + # Disable auto-vectorizer + add_compile_options(-fno-vectorize -fno-slp-vectorize) + endif() + else() + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc") + endif() +endif() +message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}") + +set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}") + +set(RISCV_LINKER_FLAGS "-lstdc++ -lpthread -lm -ldl") +set(RISCV_LINKER_FLAGS_EXE) +set(CMAKE_SHARED_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}") +set(CMAKE_MODULE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${RISCV_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}") diff --git a/riscv_script/run_qemu.sh b/riscv_script/run_qemu.sh new file mode 100755 index 00000000..080af3b1 --- /dev/null +++ b/riscv_script/run_qemu.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x +set -e + +USE_RVV="${USE_RVV:-OFF}" +TOOLCHAIN_PATH="${TOOLCHAIN_PATH:-../../build-toolchain-qemu/riscv-clang}" +QEMU_PREFIX_PATH="${QEMU_PREFIX_PATH:-../../build-toolchain-qemu/riscv-qemu/}" + +if [ "${USE_RVV}" = "ON" ];then + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0 -L ${TOOLCHAIN_PATH}/sysroot" +else + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true -L ${TOOLCHAIN_PATH}/sysroot" +fi + +$QEMU_PREFIX_PATH/bin/qemu-riscv64 $QEMU_OPTION $@ diff --git a/files/source/compare.cc b/source/compare.cc index d4713b60..50a736bd 100644 --- a/files/source/compare.cc +++ b/source/compare.cc @@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { } #endif - while (count >= (uint64_t)(kBlockSize)) { + while (count >= 
(uint64_t)kBlockSize) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; @@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a, (sum_a_sq + sum_b_sq + c1) * (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - if (ssim_d == 0.0) { + if (ssim_d == 0) { return DBL_MAX; } - return ssim_n * 1.0 / ssim_d; + return (double)ssim_n / (double)ssim_d; } } diff --git a/files/source/compare_common.cc b/source/compare_common.cc index d1cab8d2..d1cab8d2 100644 --- a/files/source/compare_common.cc +++ b/source/compare_common.cc diff --git a/files/source/compare_gcc.cc b/source/compare_gcc.cc index b834b42a..33cbe25d 100644 --- a/files/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -67,7 +67,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - return static_cast<uint32_t>(diff); + return (uint32_t)(diff); } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/files/source/compare_msa.cc b/source/compare_msa.cc index 0b807d37..0b807d37 100644 --- a/files/source/compare_msa.cc +++ b/source/compare_msa.cc diff --git a/files/source/compare_neon.cc b/source/compare_neon.cc index afdd6012..afdd6012 100644 --- a/files/source/compare_neon.cc +++ b/source/compare_neon.cc diff --git a/files/source/compare_neon64.cc b/source/compare_neon64.cc index 70fb9b91..70fb9b91 100644 --- a/files/source/compare_neon64.cc +++ b/source/compare_neon64.cc diff --git a/files/source/compare_win.cc b/source/compare_win.cc index 9bb27f1d..9bb27f1d 100644 --- a/files/source/compare_win.cc +++ b/source/compare_win.cc diff --git a/files/source/convert.cc b/source/convert.cc index 7178580f..6ac5bc43 100644 --- a/files/source/convert.cc +++ b/source/convert.cc @@ -24,6 +24,10 @@ namespace libyuv { extern "C" { #endif +// Subsample amount uses a shift. 
+// v is value +// a is amount to add to round up +// s is shift to subsample down #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? v : -v; @@ -50,18 +54,25 @@ static int I4xxToI420(const uint8_t* src_y, const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + int r; if (src_uv_width <= 0 || src_uv_height == 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + if (r != 0) { + return r; + } } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; + r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return r; } // Copy I420 with optional flipping. @@ -199,6 +210,99 @@ static int Planar16bitTo8bit(const uint16_t* src_y, return 0; } +static int I41xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, scale, kFilterBilinear); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, scale, kFilterBilinear); + } + return 0; +} + +static int I21xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } + return 0; +} + // Convert 10 bit YUV to 8 bit. LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -236,38 +340,9 @@ int I210ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - const int depth = 10; - const int scale = 1 << (24 - depth); - - if (width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - { - const int uv_width = SUBSAMPLE(width, 1, 1); - const int uv_height = SUBSAMPLE(height, 1, 1); - const int dy = FixedDiv(height, uv_height); - - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, - dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, - dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - } - return 0; + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); } LIBYUV_API @@ -292,6 +367,26 @@ int I210ToI422(const uint16_t* src_y, } LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); +} + +LIBYUV_API int I410ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -355,6 +450,26 @@ int I212ToI422(const uint16_t* src_y, } LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, 
+ int height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + +LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -375,6 +490,26 @@ int I412ToI444(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + // Any Ix10 To I010 format with mirroring. static int Ix10ToI010(const uint16_t* src_y, int src_stride_y, @@ -398,18 +533,25 @@ static int Ix10ToI010(const uint16_t* src_y, const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + int r; if (width <= 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - dst_y_width, dst_y_height, kFilterBilinear); + r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + dst_y_width, dst_y_height, kFilterBilinear); + if (r != 0) { + return r; + } } - ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; + r = ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + if (r != 0) { + return r; + } + r = 
ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return r; } LIBYUV_API @@ -649,6 +791,8 @@ int I422ToNV21(const uint8_t* src_y, // Allocate u and v buffers align_buffer_64(plane_u, halfwidth * halfheight * 2); uint8_t* plane_v = plane_u + halfwidth * halfheight; + if (!plane_u) + return 1; I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, @@ -713,6 +857,112 @@ int MM21ToI420(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + if (!src_y || !src_uv || !dst_yuy2 || width <= 0) { + return -1; + } + + DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2, + dst_stride_yuy2, width, height, 32); + + return 0; +} + +// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format +// documentation. +// TODO(greenjustin): Add an MT2T to I420 conversion. 
+LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || !height || !src_uv || !dst_uv) { + return -1; + } + + { + int uv_width = (width + 1) & ~1; + int uv_height = (height + 1) / 2; + int y = 0; + const int tile_width = 16; + const int y_tile_height = 32; + const int uv_tile_height = 16; + int padded_width = (width + tile_width - 1) & ~(tile_width - 1); + int y_tile_row_size = padded_width * y_tile_height * 10 / 8; + int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8; + size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t); + void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) = + UnpackMT2T_C; + align_buffer_64(row_buf, row_buf_size); + if (!row_buf) + return 1; + +#if defined(HAS_UNPACKMT2T_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UnpackMT2T = UnpackMT2T_NEON; + } +#endif + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + uv_height = (height + 1) / 2; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + + // Unpack and detile Y in rows of tiles + if (src_y && dst_y) { + for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, y_tile_height, y_tile_height); + src_y += src_stride_y * y_tile_height; + dst_y += dst_stride_y * y_tile_height; + } + if (height & (y_tile_height - 1)) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, height & (y_tile_height - 1), y_tile_height); + } + } + + // Unpack and detile UV plane + for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_tile_height, uv_tile_height); + src_uv += src_stride_uv * uv_tile_height; + dst_uv += dst_stride_uv * uv_tile_height; + } + if (uv_height & (uv_tile_height - 1)) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_height & (uv_tile_height - 1), + uv_tile_height); + } + free_aligned_buffer_64(row_buf); + } + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height @@ -734,7 +984,7 @@ int I422ToNV21(const uint8_t* src_y, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; int halfwidth = (width + 1) >> 1; @@ -764,11 +1014,19 @@ int I422ToNV21(const uint8_t* src_y, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -793,6 +1051,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -833,6 +1096,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); @@ -842,6 +1110,8 @@ int I422ToNV21(const uint8_t* src_y, int awidth = halfwidth * 2; align_buffer_64(row_vu_0, awidth * 2); uint8_t* row_vu_1 = row_vu_0 + awidth; + if (!row_vu_0) + return 1; for (y = 0; y < height - 1; y += 2) { MergeUVRow(src_v, src_u, row_vu_0, halfwidth); @@ -1080,18 +1350,22 @@ int NV12ToNV24(const uint8_t* src_y, int dst_stride_uv, int width, int height) { + int r; if 
(width <= 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; + r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return r; } LIBYUV_API @@ -1105,20 +1379,88 @@ int NV16ToNV24(const uint8_t* src_y, int dst_stride_uv, int width, int height) { + int r; if (width <= 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return r; +} + +// Any P[420]1[02] to I[420]1[02] format with mirroring. 
+static int PxxxToIxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, uv_width, uv_height, depth); return 0; } LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 10); +} + +LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 12); +} + +LIBYUV_API int P010ToP410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, @@ -1129,18 +1471,22 @@ int P010ToP410(const uint16_t* src_y, int dst_stride_uv, int width, int height) { + int r; if (width <= 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = ScalePlane_16(src_y, src_stride_y, width, 
height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; + r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return r; } LIBYUV_API @@ -1154,17 +1500,21 @@ int P210ToP410(const uint16_t* src_y, int dst_stride_uv, int width, int height) { + int r; if (width <= 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); - return 0; + r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return r; } // Convert YUY2 to I420. 
@@ -1231,6 +1581,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUVRow = YUY2ToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUVRow = YUY2ToUVRow_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1322,6 +1682,26 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { UYVYToYRow = UYVYToYRow_Any_LASX; @@ -1574,6 +1954,176 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif + + for (y = 0; y 
< height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +#ifdef USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +// The following version calls ARGBExtractAlpha on the full image. +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); + if (r == 0) { + r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width, + height); + } + return r; +} +#else // USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1584,22 +2134,63 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); + ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, + width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; + dst_a += dst_stride_a * 2; } if (height & 1) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); } return 0; } +#endif // USE_EXTRACTALPHA // Convert BGRA to I420. 
LIBYUV_API @@ -1628,16 +2219,6 @@ int BGRAToI420(const uint8_t* src_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } -#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; - BGRAToYRow = BGRAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - BGRAToYRow = BGRAToYRow_SSSE3; - } - } -#endif #if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; @@ -1654,12 +2235,46 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToYRow = BGRAToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_AVX2; + } + } +#endif +#if defined(HAS_BGRATOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToUVRow = BGRAToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToUVRow = BGRAToUVRow_AVX2; + } + } +#endif #if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { BGRAToYRow = BGRAToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { BGRAToUVRow = BGRAToUVRow_MSA; } } @@ -1674,6 +2289,19 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + BGRAToYRow = BGRAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_LASX; + } + } +#endif +#if defined(HAS_BGRATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { 
+ BGRAToYRow = BGRAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -1786,6 +2414,19 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -1882,6 +2523,19 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_LASX) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYRow = RGBAToYRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYRow = RGBAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -1901,7 +2555,7 @@ int RGBAToI420(const uint8_t* src_rgba, // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_LSX)) + defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV)) #define HAS_RGB24TOYROW #endif @@ -1986,6 +2640,11 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW @@ -2035,8 +2694,10 @@ int RGB24ToI420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -2046,10 +2707,10 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2075,7 +2736,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, #undef HAS_RGB24TOYROW // Enabled if 1 pass is available -#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_RVV) #define HAS_RGB24TOYJROW #endif @@ -2140,6 +2802,27 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. 
#else // HAS_RGB24TOYJROW @@ -2189,8 +2872,10 @@ int RGB24ToJ420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYJROW) // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -2200,10 +2885,10 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2230,7 +2915,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_LSX)) + defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV)) #define HAS_RAWTOYROW #endif @@ -2314,6 +2999,11 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYROW @@ -2363,8 +3053,10 @@ int RAWToI420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -2374,10 +3066,10 @@ int RAWToI420(const uint8_t* src_raw, RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2403,7 +3095,8 @@ int RAWToI420(const uint8_t* src_raw, #undef HAS_RAWTOYROW // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2468,6 +3161,27 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW @@ -2517,8 +3231,10 @@ int RAWToJ420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -2528,10 +3244,10 @@ int RAWToJ420(const uint8_t* src_raw, RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2695,8 +3411,10 @@ int RGB565ToI420(const uint8_t* src_rgb565, #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ @@ -2706,10 +3424,10 @@ int RGB565ToI420(const uint8_t* src_rgb565, RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb565 += src_stride_rgb565 * 2; dst_y += dst_stride_y * 2; @@ -2875,8 +3593,10 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -2888,11 +3608,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, width); #else ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb1555 += src_stride_argb1555 * 2; dst_y += dst_stride_y * 2; @@ -3055,6 +3775,24 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -3070,8 +3808,10 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, { #if !(defined(HAS_ARGB4444TOYROW_NEON)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; #endif for (y = 0; y < height - 1; y += 2) { @@ -3082,11 +3822,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb4444 += src_stride_argb4444 * 2; dst_y += dst_stride_y * 2; @@ -3167,6 +3907,27 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToYJRow(src_rgb24, dst_yj, width); @@ -3235,6 +3996,27 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + 
RAWToYJRow = RAWToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToYJRow(src_raw, dst_yj, width); diff --git a/files/source/convert_argb.cc b/source/convert_argb.cc index 71ef8c10..871fea59 100644 --- a/files/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -7,8 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include "libyuv/convert_argb.h" +#include <assert.h> + +#include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" @@ -65,6 +69,7 @@ int I420ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -115,6 +120,14 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -123,6 +136,11 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -298,6 +316,7 @@ int I422ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -355,6 +374,14 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = 
I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -363,6 +390,11 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -536,6 +568,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -592,6 +625,11 @@ int I444ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -747,6 +785,133 @@ int U444ToABGR(const uint8_t* src_y, width, height); } +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to RGB24. 
+LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I444 to RAW. +LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. 
@@ -767,6 +932,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -926,6 +1092,7 @@ int I012ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -983,6 +1150,7 @@ int I210ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1137,6 +1305,7 @@ int I410ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1190,6 +1359,7 @@ int I010ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1353,6 +1523,7 @@ int I012ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1408,6 +1579,7 @@ int I210ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 
0 || height == 0) { return -1; } @@ -1568,6 +1740,7 @@ int I410ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1617,6 +1790,7 @@ int P010ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1667,6 +1841,7 @@ int P210ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1715,6 +1890,7 @@ int P010ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1765,6 +1941,7 @@ int P210ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1823,6 +2000,7 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1865,6 +2043,14 @@ int 
I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1873,6 +2059,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -1905,6 +2096,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -1947,6 +2143,7 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1989,6 +2186,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1997,6 +2202,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + 
} +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2029,6 +2239,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2069,6 +2284,7 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2111,6 +2327,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2143,6 +2364,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2312,6 +2538,7 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2370,6 +2597,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { 
I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2412,6 +2644,7 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2470,6 +2703,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2510,6 +2748,7 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2568,6 +2807,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2597,6 +2841,7 @@ int I400ToARGBMatrix(const uint8_t* src_y, void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; + assert(yuvconstants); if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2652,6 +2897,11 @@ int I400ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I400ToARGBRow = I400ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, yuvconstants, width); @@ -2739,6 +2989,12 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if 
defined(HAS_J400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + J400ToARGBRow = J400ToARGBRow_RVV; + } +#endif + for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -2747,6 +3003,7 @@ int J400ToARGB(const uint8_t* src_y, return 0; } +#ifndef __riscv // Shuffle table for converting BGRA to ARGB. static const uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; @@ -2834,6 +3091,195 @@ int AR64ToAB64(const uint16_t* src_ar64, return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } +#else +// Convert BGRA to ARGB (same as ARGBToBGRA). +LIBYUV_API +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBToBGRA(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, width, + height); +} + +// Convert ARGB to BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + int y; + void (*ARGBToBGRARow)(const uint8_t* src_argb, uint8_t* dst_bgra, int width) = + ARGBToBGRARow_C; + if (!src_argb || !dst_bgra || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_bgra == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_bgra = 0; + } + +#if defined(HAS_ARGBTOBGRAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToBGRARow = ARGBToBGRARow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToBGRARow(src_argb, dst_bgra, width); + src_argb += src_stride_argb; + dst_bgra += dst_stride_bgra; + } + return 0; +} + +// Convert ARGB to ABGR. 
+LIBYUV_API +int ARGBToABGR(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + void (*ARGBToABGRRow)(const uint8_t* src_argb, uint8_t* dst_abgr, int width) = + ARGBToABGRRow_C; + if (!src_argb || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_abgr = 0; + } + +#if defined(HAS_ARGBTOABGRROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToABGRRow = ARGBToABGRRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToABGRRow(src_argb, dst_abgr, width); + src_argb += src_stride_argb; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert ABGR to ARGB (same as ARGBToABGR). +LIBYUV_API +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBToABGR(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, width, + height); +} + +// Convert RGBA to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGBAToARGBRow)(const uint8_t* src_rgba, uint8_t* dst_argb, int width) = + RGBAToARGBRow_C; + if (!src_rgba || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } + // Coalesce rows. 
+ if (src_stride_rgba == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgba = dst_stride_argb = 0; + } + +#if defined(HAS_RGBATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToARGBRow = RGBAToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RGBAToARGBRow(src_rgba, dst_argb, width); + src_rgba += src_stride_rgba; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*AR64ToAB64Row)(const uint16_t* src_ar64, uint16_t* dst_ab64, + int width) = AR64ToAB64Row_C; + if (!src_ar64 || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ab64 = 0; + } + +#if defined(HAS_AR64TOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToAB64Row = AR64ToAB64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToAB64Row(src_ar64, dst_ab64, width); + src_ar64 += src_stride_ar64; + dst_ab64 += dst_stride_ab64; + } + return 0; +} +#endif // Convert RGB24 to ARGB. 
LIBYUV_API @@ -2901,6 +3347,11 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -2976,6 +3427,11 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToARGBRow = RAWToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -3027,6 +3483,11 @@ int RAWToRGBA(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGBARow = RAWToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGBARow(src_raw, dst_rgba, width); @@ -3431,6 +3892,11 @@ int AR64ToARGB(const uint16_t* src_ar64, } } #endif +#if defined(HAS_AR64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToARGBRow = AR64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AR64ToARGBRow(src_ar64, dst_argb, width); @@ -3490,6 +3956,11 @@ int AB64ToARGB(const uint16_t* src_ab64, } } #endif +#if defined(HAS_AB64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AB64ToARGBRow = AB64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AB64ToARGBRow(src_ab64, dst_argb, width); @@ -3514,6 +3985,7 @@ int NV12ToARGBMatrix(const uint8_t* src_y, void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3571,6 +4043,11 @@ int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToARGBRow = NV12ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -3598,6 +4075,7 @@ int NV21ToARGBMatrix(const 
uint8_t* src_y, void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3655,6 +4133,11 @@ int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToARGBRow = NV21ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -3741,6 +4224,7 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -3774,6 +4258,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToRGB24Row = NV12ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); @@ -3801,6 +4290,7 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -3834,6 +4324,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToRGB24Row = NV21ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); @@ -4143,6 +4638,7 @@ int Android420ToARGBMatrix(const uint8_t* src_y, const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + 
assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -4174,6 +4670,8 @@ int Android420ToARGBMatrix(const uint8_t* src_y, // General case fallback creates NV12 align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + if (!plane_uv) + return 1; dst_uv = plane_uv; for (y = 0; y < halfheight; ++y) { WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); @@ -4243,6 +4741,7 @@ int I422ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ -4284,6 +4783,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4292,6 +4799,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4354,6 +4866,7 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, void (*NV12ToRGB565Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4456,6 +4969,7 @@ int I420ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { 
return -1; } @@ -4497,6 +5011,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4505,6 +5027,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4572,6 +5099,7 @@ int I420ToRGB24Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -4613,6 +5141,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB24Row = I422ToRGB24Row_Any_LASX; @@ -4621,6 +5157,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -4742,6 +5283,134 @@ int H420ToRAW(const uint8_t* src_y, width, height); } +// Convert I422 to RGB24 with matrix. 
+LIBYUV_API +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif 
+#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. 
LIBYUV_API int I420ToARGB1555(const uint8_t* src_y, @@ -4801,6 +5470,14 @@ int I420ToARGB1555(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB1555ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; @@ -4882,6 +5559,14 @@ int I420ToARGB4444(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; @@ -4922,6 +5607,7 @@ int I420ToRGB565Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4963,6 +5649,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5035,23 +5729,25 @@ int H420ToRGB565(const uint8_t* src_y, &kYuvH709Constants, width, height); } -// Convert I422 to RGB565. +// Convert I422 to RGB565 with specified color matrix. 
LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -5093,6 +5789,14 @@ int I422ToRGB565(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5103,7 +5807,7 @@ int I422ToRGB565(const uint8_t* src_y, #endif for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; src_u += src_stride_u; @@ -5112,6 +5816,23 @@ int I422ToRGB565(const uint8_t* src_y, return 0; } +// Convert I422 to RGB565. 
+LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, @@ -5136,7 +5857,7 @@ int I420ToRGB565Dither(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -5191,6 +5912,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -5199,6 +5928,11 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; @@ -5231,6 +5965,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if 
defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -5242,6 +5984,8 @@ int I420ToRGB565Dither(const uint8_t* src_y, { // Allocate a row of argb. align_buffer_64(row_argb, width * 4); + if (!row_argb) + return 1; for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, @@ -5278,6 +6022,7 @@ int I420ToAR30Matrix(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5401,9 +6146,12 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5453,48 +6201,65 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 
defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; - - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, 
temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5506,8 +6271,8 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5531,8 +6296,9 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5582,36 +6348,48 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + 
ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5623,6 +6401,156 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, return 0; } +static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = 
ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + static int I010ToAR30MatrixBilinear(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -5639,9 +6567,12 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - 
ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5668,41 +6599,46 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; - - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + uint16_t* 
temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5714,8 +6650,8 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); } @@ -5740,8 +6676,9 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5770,29 +6707,31 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = 
ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5819,9 +6758,12 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5848,41 +6790,46 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#if 
defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; - - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, 
kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5894,8 +6841,8 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5919,8 +6866,9 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5949,29 +6897,31 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const 
int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6006,9 +6956,12 @@ static int I420AlphaToARGBMatrixBilinear( int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6059,6 +7012,11 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6091,40 +7049,58 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp 
= ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; - - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6135,8 +7111,8 @@ static int 
I420AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6158,8 +7134,8 @@ static int I420AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6193,8 +7169,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6245,6 +7222,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6277,36 +7259,49 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + #if 
defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6346,9 +7341,12 @@ static int I010AlphaToARGBMatrixBilinear( int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void 
(*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6407,35 +7405,45 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; - - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + 
uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + if (!row) + return 1; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6446,8 +7454,8 @@ static int I010AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6469,8 +7477,8 @@ static int I010AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6504,8 +7512,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6564,32 +7573,39 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if 
defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6618,9 +7634,10 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6649,35 +7666,37 @@ static int P010ToARGBMatrixBilinear(const uint16_t* 
src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; + if (!row) + return 1; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6688,7 +7707,7 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); } @@ -6709,8 +7728,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, 
const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6739,28 +7759,30 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6784,9 +7806,10 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const 
uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6815,35 +7838,37 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; + if (!row) + return 1; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6854,7 +7879,7 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, 
kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); } @@ -6875,8 +7900,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6905,28 +7931,30 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); + if (!row) + return 1; for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6937,6 +7965,140 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, return 0; } +static int I422ToRGB24MatrixLinear(const 
uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = 
ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + if (!row) + return 1; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +LIBYUV_API +int I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToRGB24MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + LIBYUV_API int I420ToARGBMatrixFilter(const uint8_t* src_y, int src_stride_y, @@ -6998,6 +8160,35 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, } LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const 
struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I420ToRGB24MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -7015,13 +8206,12 @@ int I010ToAR30MatrixFilter(const uint16_t* src_y, return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToAR30MatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7074,13 +8264,12 @@ int I010ToARGBMatrixFilter(const uint16_t* src_y, return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7137,14 +8326,13 @@ int I420AlphaToARGBMatrixFilter(const uint8_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear 
using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I420AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7206,14 +8394,13 @@ int I010AlphaToARGBMatrixFilter(const uint16_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7253,6 +8440,8 @@ int I210AlphaToARGBMatrixFilter(const uint16_t* src_y, return -1; } +// TODO(fb): Verify this function works correctly. P010 is like NV12 but 10 bit +// UV is biplanar. 
LIBYUV_API int P010ToARGBMatrixFilter(const uint16_t* src_y, int src_stride_y, @@ -7269,13 +8458,12 @@ int P010ToARGBMatrixFilter(const uint16_t* src_y, return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7324,13 +8512,12 @@ int P010ToAR30MatrixFilter(const uint16_t* src_y, return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; diff --git a/files/source/convert_from.cc b/source/convert_from.cc index 8bd07e4c..e69da9e9 100644 --- a/files/source/convert_from.cc +++ b/source/convert_from.cc @@ -52,19 +52,26 @@ static int I420ToI4xx(const uint8_t* src_y, const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + int r; if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || dst_uv_height <= 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + if (r != 0) { + return r; + } } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, 
kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; + r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return r; } // Convert 8 bit YUV to 10 bit. @@ -223,21 +230,28 @@ int I010ToI410(const uint16_t* src_y, int dst_stride_v, int width, int height) { + int r; if (width == 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), - Abs(height), kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), - Abs(height), kFilterBilinear); - return 0; + r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } + } + r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), + Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), + Abs(height), kFilterBilinear); + return r; } // 422 chroma to 444 chroma, 10/12 bit version @@ -256,19 +270,26 @@ int I210ToI410(const uint16_t* src_y, int dst_stride_v, int width, int height) { + int r; if (width == 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = 
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; + r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return r; } // 422 chroma is 1/2 width, 1x height @@ -288,19 +309,26 @@ int I422ToI444(const uint8_t* src_y, int dst_stride_v, int width, int height) { + int r; if (width == 0 || height == 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); + r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } } - ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; + r = ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return r; } // Copy to I400. 
Source can be I420,422,444,400,NV12,NV21 @@ -446,6 +474,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -533,6 +569,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -608,6 +652,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; diff --git a/files/source/convert_from_argb.cc b/source/convert_from_argb.cc index e50c2af3..b45de8c8 100644 --- a/files/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -76,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToUV444Row = ARGBToUV444Row_Any_LASX; @@ -116,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if 
defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -124,6 +140,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -230,7 +251,24 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif - +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -241,6 +279,11 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); @@ -340,6 +383,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -350,6 +401,11 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -361,11 +417,19 @@ int ARGBToNV12(const uint8_t* 
src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -390,10 +454,17 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + if (!row_u) + return 1; for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -502,6 +573,24 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -512,6 +601,11 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -523,11 +617,19 @@ int ARGBToNV21(const uint8_t* src_argb, #if 
defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -552,10 +654,17 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + if (!row_u) + return 1; for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -663,6 +772,27 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -674,11 +804,19 @@ int ABGRToNV12(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if 
(IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -703,10 +841,17 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + if (!row_u) + return 1; for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); @@ -815,6 +960,27 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -826,11 +992,19 @@ int ABGRToNV21(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -855,10 +1029,17 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ 
= MergeUVRow_RVV; + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + if (!row_u) + return 1; for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); @@ -972,6 +1153,24 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -982,6 +1181,11 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -1014,6 +1218,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -1028,6 +1240,8 @@ int ARGBToYUY2(const uint8_t* src_argb, align_buffer_64(row_y, ((width + 63) & ~63) * 2); uint8_t* row_u = row_y + ((width + 63) & ~63); uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + if (!row_y) + return 1; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -1135,6 +1349,24 @@ int ARGBToUYVY(const uint8_t* src_argb, } } 
#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1145,6 +1377,11 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -1177,6 +1414,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -1191,6 +1436,8 @@ int ARGBToUYVY(const uint8_t* src_argb, align_buffer_64(row_y, ((width + 63) & ~63) * 2); uint8_t* row_u = row_y + ((width + 63) & ~63); uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + if (!row_y) + return 1; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -1262,6 +1509,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1270,6 +1525,11 @@ int ARGBToI400(const uint8_t* src_argb, } } 
#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -1279,6 +1539,7 @@ int ARGBToI400(const uint8_t* src_argb, return 0; } +#ifndef __riscv // Shuffle table for converting ARGB to RGBA. static const uvec8 kShuffleMaskARGBToRGBA = { 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; @@ -1294,6 +1555,47 @@ int ARGBToRGBA(const uint8_t* src_argb, return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } +#else +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + int y; + void (*ARGBToRGBARow)(const uint8_t* src_argb, uint8_t* dst_rgba, int width) = + ARGBToRGBARow_C; + if (!src_argb || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgba = 0; + } + +#if defined(HAS_ARGBTORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGBARow = ARGBToRGBARow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGBARow(src_argb, dst_rgba, width); + src_argb += src_stride_argb; + dst_rgba += dst_stride_rgba; + } + return 0; +} +#endif // Convert ARGB To RGB24. 
LIBYUV_API @@ -1360,6 +1662,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; @@ -1368,6 +1678,11 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGB24Row = ARGBToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1434,6 +1749,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORAWROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRAWRow = ARGBToRAWRow_Any_LASX; @@ -1442,6 +1765,11 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRAWRow = ARGBToRAWRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1467,7 +1795,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1512,6 +1840,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) if 
(TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -1589,6 +1925,15 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_LSX; + } + } +#endif + #if defined(HAS_ARGBTORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; @@ -1663,6 +2008,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB1555ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; @@ -1737,6 +2090,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; @@ -1858,19 +2219,19 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj 
|| !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1879,6 +2240,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -1903,19 +2280,11 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif @@ -1951,18 +2320,23 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); src_argb += src_stride_argb * 2; dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; 
} if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); } return 0; @@ -1974,19 +2348,19 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1997,10 +2371,10 @@ int ARGBToJ422(const uint8_t* src_argb, } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2026,6 +2400,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -2074,270 +2456,649 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert ARGB to AR64. +// Convert ARGB to J400. LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, +int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. 
if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ar64 = 0; + src_stride_argb = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAR64ROW_NEON) +#if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); + ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to AB64. +// Convert RGBA to J400. 
LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, +int RGBAToJ400(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = + RGBAToYJRow_C; + if (!src_rgba || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; } // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + if (src_stride_rgba == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ab64 = 0; + src_stride_rgba = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) +#if defined(HAS_RGBATOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; + RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) +#if defined(HAS_RGBATOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; + RGBAToYJRow = RGBAToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAB64ROW_NEON) +#if defined(HAS_RGBATOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; + 
RGBAToYJRow = RGBAToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGBAToYJRow = RGBAToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_LSX; } } #endif +#if defined(HAS_RGBATOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGBAToYJRow = RGBAToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYJRow = RGBAToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; + RGBAToYJRow(src_rgba, dst_yj, width); + src_rgba += src_stride_rgba; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to J400. +// Convert ABGR to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + 
ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width); + src_abgr += src_stride_abgr * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + } + return 0; +} + +// Convert ABGR to J422. (JPeg full range I422). +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow 
= ABGRToUVJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; } } #endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert RGBA to J400. +// Convert ABGR to J400. LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; - void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = - RGBAToYJRow_C; - if (!src_rgba || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_rgba == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_rgba = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = 0; } -#if defined(HAS_RGBATOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_RGBATOYJROW_AVX2) +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBAToYJRow = RGBAToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RGBAToYJRow = RGBAToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_RGBATOYJROW_NEON) +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYJRow = RGBAToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_RGBATOYJROW_MSA) +#if defined(HAS_ABGRTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; } } #endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - RGBAToYJRow(src_rgba, dst_yj, width); - src_rgba += src_stride_rgba; + ABGRToYJRow(src_abgr, dst_yj, 
width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; } return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAR64Row = ARGBToAR64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. 
+LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAB64Row = ARGBToAB64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2355,7 +3116,7 @@ int RAWToJNV21(const uint8_t* src_raw, int halfwidth = (width + 1) >> 1; #if defined(HAS_RAWTOYJROW) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, 
uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; @@ -2363,12 +3124,12 @@ int RAWToJNV21(const uint8_t* src_raw, void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; @@ -2403,6 +3164,27 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else // HAS_RAWTOYJROW @@ -2459,11 +3241,19 @@ int RAWToJNV21(const uint8_t* src_raw, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -2488,29 +3278,41 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif { +#if defined(HAS_RAWTOYJROW) // Allocate a row of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); -#if !defined(HAS_RAWTOYJROW) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; +#else + // Allocate row of uv and 2 rows of ARGB. 
+ const int row_size = ((width * 4 + 31) & ~31); + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; + uint8_t* row = row_vj + row_uv_size; #endif + if (!row_uj) + return 1; for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2518,20 +3320,17 @@ int RAWToJNV21(const uint8_t* src_raw, } if (height & 1) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); #endif } -#if !defined(HAS_RAWTOYJROW) - free_aligned_buffer_64(row); -#endif - free_aligned_buffer_64(row_u); + 
free_aligned_buffer_64(row_uj); } return 0; } diff --git a/files/source/convert_jpeg.cc b/source/convert_jpeg.cc index d7556ee9..d7556ee9 100644 --- a/files/source/convert_jpeg.cc +++ b/source/convert_jpeg.cc diff --git a/files/source/convert_to_argb.cc b/source/convert_to_argb.cc index 84df16c8..84df16c8 100644 --- a/files/source/convert_to_argb.cc +++ b/source/convert_to_argb.cc diff --git a/files/source/convert_to_i420.cc b/source/convert_to_i420.cc index 5869ecd7..5869ecd7 100644 --- a/files/source/convert_to_i420.cc +++ b/source/convert_to_i420.cc diff --git a/files/source/cpu_id.cc b/source/cpu_id.cc index 56fe60e4..eedce16b 100644 --- a/files/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -40,7 +40,6 @@ extern "C" { // cpu_info_ variable for SIMD instruction sets detected. LIBYUV_API int cpu_info_ = 0; -// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ @@ -108,14 +107,14 @@ void CpuId(int eax, int ecx, int* cpu_info) { // } // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. // https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -int GetXCR0() { +static int GetXCR0() { int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT @@ -129,7 +128,7 @@ int GetXCR0() { #define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. 
-#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", on) #endif @@ -137,13 +136,14 @@ int GetXCR0() { // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume Neon if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return kCpuHasNEON; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "Features", 8) == 0) { char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { @@ -162,17 +162,90 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// TODO(fbarchard): Consider read_msa_ir(). +LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); + if (!f) { +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasRVV; +#else + return 0; +#endif + } + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { + if (memcmp(cpuinfo_line, "isa", 3) == 0) { + // ISA string must begin with rv64{i,e,g} for a 64-bit processor. + char* isa = strstr(cpuinfo_line, "rv64"); + if (isa) { + size_t isa_len = strlen(isa); + char* extensions; + size_t extensions_len = 0; + size_t std_isa_len; + // Remove the new-line character at the end of string + if (isa[isa_len - 1] == '\n') { + isa[--isa_len] = '\0'; + } + // 5 ISA characters + if (isa_len < 5) { + fclose(f); + return 0; + } + // Skip {i,e,g} canonical checking. 
+ // Skip rvxxx + isa += 5; + // Find the very first occurrence of 's', 'x' or 'z'. + // To detect multi-letter standard, non-standard, and + // supervisor-level extensions. + extensions = strpbrk(isa, "zxs"); + if (extensions) { + // Multi-letter extensions are seperated by a single underscore + // as described in RISC-V User-Level ISA V2.2. + char* ext = strtok(extensions, "_"); + extensions_len = strlen(extensions); + while (ext) { + // Search for the ZVFH (Vector FP16) extension. + if (!strcmp(ext, "zvfh")) { + flag |= kCpuHasRVVZVFH; + } + ext = strtok(NULL, "_"); + } + } + std_isa_len = isa_len - extensions_len - 5; + // Detect the v in the standard single-letter extensions. + if (memchr(isa, 'v', std_isa_len)) { + // The RVV implied the F extension. + flag |= kCpuHasRVV; + } + } + } +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is from x86 host running QEMU. + else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) || + (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) { + fclose(f); + return kCpuHasRVV; + } +#endif + } + fclose(f); + return flag; +} + LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return 0; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { // Workaround early kernel without MSA in ASEs line. if (strstr(cpuinfo_line, "Loongson-2K")) { @@ -191,14 +264,13 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { return flag; } -// TODO(fbarchard): Consider read_loongarch_ir(). 
#define LOONGARCH_CFG2 0x2 #define LOONGARCH_CFG2_LSX (1 << 6) #define LOONGARCH_CFG2_LASX (1 << 7) #if defined(__loongarch__) LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { - int flag = 0x0; + int flag = 0; uint32_t cfg2 = 0; __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2)); @@ -220,10 +292,12 @@ static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info0[4] = {0, 0, 0, 0}; int cpu_info1[4] = {0, 0, 0, 0}; int cpu_info7[4] = {0, 0, 0, 0}; + int cpu_einfo7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); + CpuId(7, 1, cpu_einfo7); } cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | @@ -236,7 +310,9 @@ static SAFEBUFFERS int GetCpuFlags(void) { ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0) | + ((cpu_einfo7[0] & 0x00000010) ? kCpuHasAVXVNNI : 0) | + ((cpu_einfo7[3] & 0x00000010) ? kCpuHasAVXVNNIINT8 : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { @@ -246,8 +322,7 @@ static SAFEBUFFERS int GetCpuFlags(void) { cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; - cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; + cpu_info |= (cpu_einfo7[3] & 0x00080000) ? 
kCpuHasAVX10 : 0; } } #endif @@ -277,6 +352,10 @@ static SAFEBUFFERS int GetCpuFlags(void) { #endif cpu_info |= kCpuHasARM; #endif // __arm__ +#if defined(__riscv) && defined(__linux__) + cpu_info = RiscvCpuCaps("/proc/cpuinfo"); + cpu_info |= kCpuHasRISCV; +#endif // __riscv cpu_info |= kCpuInitialized; return cpu_info; } diff --git a/files/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc index 4ccf00a3..0141da8a 100644 --- a/files/source/mjpeg_decoder.cc +++ b/source/mjpeg_decoder.cc @@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { } buf_.data = src; - buf_.len = static_cast<int>(src_len); + buf_.len = (int)src_len; buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast<size_t>(num_bytes); + size_t bytes = (size_t)num_bytes; if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; diff --git a/files/source/mjpeg_validate.cc b/source/mjpeg_validate.cc index ba0a03ab..ba0a03ab 100644 --- a/files/source/mjpeg_validate.cc +++ b/source/mjpeg_validate.cc diff --git a/files/source/planar_functions.cc b/source/planar_functions.cc index 169d4a8f..1c94e260 100644 --- a/files/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y, CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Copy plane for (y = 0; y < height; ++y) { @@ -162,7 +167,7 @@ void Convert8To16Plane(const uint8_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, - int scale, // 16384 for 10 bits + int scale, // 1024 for 10 bits int width, int height) { int y; @@ -333,6 +338,45 @@ int I210Copy(const uint16_t* src_y, return 0; } +// Copy I410. +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + // Copy I400. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -385,6 +429,7 @@ int I420ToI400(const uint8_t* src_y, } // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -418,6 +463,7 @@ int NV12Copy(const uint8_t* src_y, } // Copy NV21. Supports inverting. 
+LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, @@ -504,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. @@ -553,11 +604,19 @@ void MergeUVPlane(const uint8_t* src_u, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -582,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -687,7 +751,7 @@ void MergeUVPlane_16(const uint16_t* src_u, #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_16 = MergeUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { MergeUVRow_16 = MergeUVRow_16_AVX2; } } @@ -911,31 +975,31 @@ int NV21ToNV12(const uint8_t* src_y, return 0; } +// Test if tile_height is a power of 2 (16 or 32) +#define IS_POWEROFTWO(x) (!((x) & ((x)-1))) + // Detile a plane of data // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 // TODO: More detile row functions. 
- LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height) { +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { const ptrdiff_t src_tile_stride = 16 * tile_height; int y; void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) = DetileRow_C; - assert(src_stride_y >= 0); - assert(tile_height > 0); - assert(src_stride_y > 0); - - if (width <= 0 || height == 0) { - return; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -970,6 +1034,72 @@ void DetilePlane(const uint8_t* src_y, src_y = src_y - src_tile_stride + src_stride_y * tile_height; } } + return 0; +} + +// Convert a plane of 16 bit tiles of 16 x H to linear. +// tile width is 16 and assumed. +// tile_height is 16 or 32 for MT2T. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride, + uint16_t* dst, int width) = DetileRow_16_C; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + +#if defined(HAS_DETILEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow_16 = DetileRow_16_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_16_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + DetileRow_16 = DetileRow_16_Any_AVX; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_AVX; + } + } +#endif +#if defined(HAS_DETILEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileRow_16 = DetileRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow_16(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } + } + return 0; } LIBYUV_API @@ -1033,6 +1163,74 @@ void DetileSplitUVPlane(const uint8_t* src_uv, } } +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height) { + const ptrdiff_t src_y_tile_stride = 16 * tile_height; + const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2; + int y; + void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, int width) = DetileToYUY2_C; + assert(src_stride_y >= 0); + assert(src_stride_y > 0); + assert(src_stride_uv >= 0); + assert(src_stride_uv > 0); + assert(tile_height > 0); + + if (width <= 0 || height == 0 || tile_height <= 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + +#if defined(HAS_DETILETOYUY2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileToYUY2 = DetileToYUY2_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_NEON; + } + } +#endif + +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, + width); + dst_yuy2 += dst_stride_yuy2; + src_y += 16; + + if (y & 0x1) + src_uv += 16; + + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_y_tile_stride + src_stride_y * tile_height; + src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2); + } + } +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API @@ -1085,6 +1283,11 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitRGBRow = SplitRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -1144,6 +1347,11 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGERGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeRGBRow = MergeRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. 
@@ -1156,18 +1364,18 @@ void MergeRGBPlane(const uint8_t* src_r, } LIBYUV_NOINLINE -void SplitARGBPlaneAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { +static void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { int y; void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) = @@ -1175,6 +1383,9 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { width *= height; @@ -1215,6 +1426,11 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitARGBRow = SplitARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); @@ -1227,21 +1443,24 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } LIBYUV_NOINLINE -void SplitARGBPlaneOpaque(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +static void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int y; void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitXRGBRow_C; assert(height > 0); + 
if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; @@ -1281,6 +1500,11 @@ void SplitARGBPlaneOpaque(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitXRGBRow = SplitXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); @@ -1328,18 +1552,18 @@ void SplitARGBPlane(const uint8_t* src_argb, } LIBYUV_NOINLINE -void MergeARGBPlaneAlpha(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, @@ -1347,6 +1571,9 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; @@ -1378,6 +1605,11 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeARGBRow = MergeARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); @@ -1390,16 +1622,16 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } LIBYUV_NOINLINE -void MergeARGBPlaneOpaque(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - 
const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) = @@ -1407,6 +1639,9 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; @@ -1437,6 +1672,11 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeXRGBRow = MergeXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); @@ -1888,6 +2128,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1984,6 +2234,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUV422Row = UYVYToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUV422Row = UYVYToUV422Row_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) 
{ UYVYToYRow = UYVYToYRow_Any_LASX; @@ -2131,6 +2391,14 @@ int UYVYToY(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToYRow(src_uyvy, dst_y, width); @@ -2189,6 +2457,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -2255,6 +2531,14 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_MIRRORUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorUVRow = MirrorUVRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_LSX; + } + } +#endif #if defined(HAS_MIRRORUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorUVRow = MirrorUVRow_Any_LASX; @@ -2427,6 +2711,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -2491,37 +2783,6 @@ int RGB24Mirror(const uint8_t* src_rgb24, return 0; } -// Get a blender that optimized for the CPU and pixel count. -// As there are 6 blenders to choose from, the caller should try to use -// the same blend function for all pixels if possible. 
-LIBYUV_API -ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; - return ARGBBlendRow; - } -#endif -#if defined(HAS_ARGBBLENDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBBlendRow = ARGBBlendRow_NEON; - } -#endif -#if defined(HAS_ARGBBLENDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBBlendRow = ARGBBlendRow_MSA; - } -#endif -#if defined(HAS_ARGBBLENDROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBBlendRow = ARGBBlendRow_LSX; - } -#endif - return ARGBBlendRow; -} - // Alpha Blend 2 ARGB images and store to destination. LIBYUV_API int ARGBBlend(const uint8_t* src_argb0, @@ -2534,7 +2795,7 @@ int ARGBBlend(const uint8_t* src_argb0, int height) { int y; void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = GetARGBBlend(); + uint8_t* dst_argb, int width) = ARGBBlendRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2551,7 +2812,31 @@ int ARGBBlend(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif +#if defined(HAS_ARGBBLENDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBBlendRow = ARGBBlendRow_LSX; + } +#endif +#if defined(HAS_ARGBBLENDROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBBlendRow = ARGBBlendRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; @@ -2611,6 +2896,11 @@ int 
BlendPlane(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -2688,6 +2978,11 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -2724,9 +3019,16 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_SCALEROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown2 = ScaleRowDown2Box_RVV; + } +#endif // Row buffer for intermediate alpha pixels. align_buffer_64(halfalpha, halfwidth); + if (!halfalpha) + return 1; for (y = 0; y < height; y += 2) { // last row of odd height image use 1 row of alpha instead of 2. if (y == (height - 1)) { @@ -2809,6 +3111,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_LSX; + } + } +#endif #if defined(HAS_ARGBMULTIPLYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; @@ -2894,6 +3204,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAddRow = ARGBAddRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_LSX; + } + } +#endif #if defined(HAS_ARGBADDROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAddRow = ARGBAddRow_Any_LASX; @@ -2974,6 +3292,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_LSX; + } + } +#endif #if defined(HAS_ARGBSUBTRACTROW_LASX) if 
(TestCpuFlag(kCpuHasLASX)) { ARGBSubtractRow = ARGBSubtractRow_Any_LASX; @@ -3051,6 +3377,11 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGB24Row = RAWToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -3060,6 +3391,7 @@ int RAWToRGB24(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Consider uint8_t value LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, @@ -3067,7 +3399,7 @@ void SetPlane(uint8_t* dst_y, int height, uint32_t value) { int y; - void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C; if (width <= 0 || height == 0) { return; @@ -3120,7 +3452,7 @@ void SetPlane(uint8_t* dst_y, // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); + SetRow(dst_y, (uint8_t)value, width); dst_y += dst_stride_y; } } @@ -3168,7 +3500,7 @@ int ARGBRect(uint8_t* dst_argb, int height, uint32_t value) { int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3293,6 +3625,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_LSX; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; @@ -3301,6 +3641,11 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -3401,6 
+3746,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3451,6 +3801,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3473,7 +3828,7 @@ int ARGBSepia(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3499,6 +3854,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_LSX; + } +#endif #if defined(HAS_ARGBSEPIAROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBSepiaRow = ARGBSepiaRow_LASX; @@ -3616,7 +3976,7 @@ int ARGBColorTable(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3652,7 +4012,7 @@ int RGBColorTable(uint8_t* dst_argb, int width, int height) { int y; - void (*RGBColorTableRow)(uint8_t * dst_argb, const 
uint8_t* table_argb, + void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3697,7 +4057,7 @@ int ARGBQuantize(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || @@ -3924,6 +4284,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_LSX; + } +#endif #if defined(HAS_ARGBSHADEROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_LASX; @@ -3950,7 +4315,7 @@ int InterpolatePlane(const uint8_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4008,6 +4373,11 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -4030,7 +4400,7 @@ int InterpolatePlane_16(const uint16_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow_16)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t 
src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4213,6 +4583,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_LSX; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBShuffleRow = ARGBShuffleRow_Any_LASX; @@ -4334,6 +4712,8 @@ int GaussPlane_F32(const float* src, { // 2 pixels on each side, but aligned out to 16 bytes. align_buffer_64(rowbuf, (4 + width + 4) * 4); + if (!rowbuf) + return 1; memset(rowbuf, 0, 16); memset(rowbuf + (4 + width) * 4, 0, 16); float* row = (float*)(rowbuf + 16); @@ -4444,6 +4824,11 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -4477,16 +4862,18 @@ static int ARGBSobelize(const uint8_t* src_argb, #endif { // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 31) & ~31; - align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + const int row_size = (width + kEdge + 31) & ~31; + align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge)); uint8_t* row_sobelx = rows; - uint8_t* row_sobely = rows + kRowSize; - uint8_t* row_y = rows + kRowSize * 2; + uint8_t* row_sobely = rows + row_size; + uint8_t* row_y = rows + row_size * 2; // Convert first row. uint8_t* row_y0 = row_y + kEdge; - uint8_t* row_y1 = row_y0 + kRowSize; - uint8_t* row_y2 = row_y1 + kRowSize; + uint8_t* row_y1 = row_y0 + row_size; + uint8_t* row_y2 = row_y1 + row_size; + if (!rows) + return 1; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. 
@@ -4967,6 +5354,11 @@ int ARGBExtractAlpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_LSX; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); @@ -5018,6 +5410,11 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBCopyYToAlphaRow(src_y, dst_argb, width); @@ -5027,9 +5424,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, return 0; } -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. - LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, @@ -5040,13 +5434,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2, int width, int height) { int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2, + uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -5057,109 +5448,91 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_SPLITUVROW_SSE2) +#if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; + YUY2ToYRow = 
YUY2ToYRow_SSE2; } } #endif -#if defined(HAS_SPLITUVROW_AVX2) +#if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; } } #endif -#if defined(HAS_SPLITUVROW_NEON) +#if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; + YUY2ToYRow = YUY2ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; + YUY2ToYRow = YUY2ToYRow_NEON; } } #endif -#if defined(HAS_SPLITUVROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; + YUY2ToYRow = YUY2ToYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; + YUY2ToYRow = YUY2ToYRow_MSA; } } #endif -#if defined(HAS_SPLITUVROW_LSX) +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { - SplitUVRow = SplitUVRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_LSX; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; + YUY2ToYRow = YUY2ToYRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; + YUY2ToYRow = YUY2ToYRow_LSX; } } #endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; + YUY2ToYRow = YUY2ToYRow_LASX; } } #endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; + +#if defined(HAS_YUY2TONVUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; if 
(IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; + YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; +#if defined(HAS_YUY2TONVUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; + YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2; } } #endif -#if defined(HAS_INTERPOLATEROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - InterpolateRow = InterpolateRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_LSX; +#if defined(HAS_YUY2TONVUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_NEON; } } #endif - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. 
- SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); + for (y = 0; y < height - 1; y += 2) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width); } return 0; } @@ -5177,7 +5550,7 @@ int UYVYToNV12(const uint8_t* src_uyvy, int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -5231,6 +5604,12 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -5271,11 +5650,18 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif { int awidth = halfwidth * 2; // row of y and 2 rows of uv align_buffer_64(rows, awidth * 3); + if (!rows) + return 1; for (y = 0; y < height - 1; y += 2) { // Split Y from UV. @@ -5336,6 +5722,7 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_AVX2; } #endif + for (y = 0; y < height - 1; y += 2) { // Merge a row of U and V into a row of UV. 
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); diff --git a/files/source/rotate.cc b/source/rotate.cc index f1e83cbd..3f8332c3 100644 --- a/files/source/rotate.cc +++ b/source/rotate.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "libyuv/rotate.h" #include "libyuv/convert.h" @@ -138,8 +140,11 @@ void RotatePlane180(const uint8_t* src, int dst_stride, int width, int height) { - // Swap first and last row and mirror the content. Uses a temporary row. + // Swap top and bottom row and mirror the content. Uses a temporary row. align_buffer_64(row, width); + assert(row); + if (!row) + return; const uint8_t* src_bot = src + src_stride * (height - 1); uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; @@ -178,6 +183,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -206,12 +219,17 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy first row into buffer - MirrorRow(src_bot, dst, width); // Mirror last row into first row - MirrorRow(row, dst_bot, width); // Mirror buffer into last row + CopyRow(src, row, width); // Copy top row into buffer + MirrorRow(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row src += src_stride; dst += dst_stride; src_bot -= src_stride; @@ -476,6 +494,124 @@ int RotatePlane(const uint8_t* src, return -1; } +static void TransposePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i = height; + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8_16_C(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i); + } +} + +static void RotatePlane90_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane270_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. 
+ dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane180_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + const uint16_t* src_bot = src + src_stride * (height - 1); + uint16_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64(row, width * 2); + uint16_t* row_tmp = (uint16_t*)row; + assert(row); + if (!row) + return; + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + CopyRow_16_C(src, row_tmp, width); // Copy top row into buffer + MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow_16_C(row_tmp, dst_bot, width); // Mirror buffer into bottom row + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180_16(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + LIBYUV_API int I420Rotate(const uint8_t* src_y, int src_stride_y, @@ -544,6 +680,8 @@ int I420Rotate(const uint8_t* src_y, return -1; } +// I422 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. LIBYUV_API int I422Rotate(const uint8_t* src_y, int src_stride_y, @@ -562,6 +700,7 @@ int I422Rotate(const uint8_t* src_y, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + int r; if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || !dst_u || !dst_v) { return -1; @@ -579,31 +718,54 @@ int I422Rotate(const uint8_t* src_y, switch (mode) { case kRotate0: - // copy frame + // Copy frame CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. 
+ case kRotate90: - // We need to rotate and rescale, we use plane Y as temporal storage. - RotatePlane90(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, - halfheight, width, kFilterBilinear); - RotatePlane90(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, - halfheight, width, kFilterLinear); + RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, + dst_stride_u, halfheight, width, kFilterBilinear); + if (r != 0) { + return r; + } + RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, + dst_stride_v, halfheight, width, kFilterLinear); + if (r != 0) { + return r; + } RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; case kRotate270: - // We need to rotate and rescale, we use plane Y as temporal storage. 
- RotatePlane270(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, - halfheight, width, kFilterBilinear); - RotatePlane270(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, - halfheight, width, kFilterLinear); + RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, + dst_stride_u, halfheight, width, kFilterBilinear); + if (r != 0) { + return r; + } + RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, + dst_stride_v, halfheight, width, kFilterLinear); + if (r != 0) { + return r; + } RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); @@ -828,6 +990,241 @@ int Android420ToI420Rotate(const uint8_t* src_y, return -1; } +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +// I210 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. 
+LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + int r; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // Copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. 
+ + case kRotate90: + RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, + dst_stride_u, halfheight, width, kFilterBilinear); + if (r != 0) { + return r; + } + RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, + dst_stride_v, halfheight, width, kFilterLinear); + if (r != 0) { + return r; + } + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, + dst_stride_u, halfheight, width, kFilterBilinear); + if (r != 0) { + return r; + } + RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, + dst_stride_v, halfheight, width, kFilterLinear); + if (r != 0) { + return r; + } + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + default: + break; + } + return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_any.cc b/source/rotate_any.cc index 88ca7876..88ca7876 100644 --- a/files/source/rotate_any.cc +++ b/source/rotate_any.cc diff --git a/files/source/rotate_argb.cc b/source/rotate_argb.cc index 539cf98d..d55fac4f 100644 --- a/files/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/rotate.h" +#include "libyuv/rotate_argb.h" #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ @@ -68,6 +69,11 @@ static int ARGBTranspose(const uint8_t* src_argb, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV; + } +#endif for (i = 0; i < width; ++i) { // column of source to row of dest. ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); @@ -114,7 +120,6 @@ static int ARGBRotate180(const uint8_t* src_argb, int width, int height) { // Swap first and last row and mirror the content. Uses a temporary row. - align_buffer_64(row, width * 4); const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); int half_height = (height + 1) >> 1; @@ -123,6 +128,9 @@ static int ARGBRotate180(const uint8_t* src_argb, ARGBMirrorRow_C; void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = CopyRow_C; + align_buffer_64(row, width * 4); + if (!row) + return 1; #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; @@ -155,6 +163,14 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -183,6 +199,11 @@ static int ARGBRotate180(const uint8_t* src_argb, CopyRow = IS_ALIGNED(width * 4, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { diff --git a/source/rotate_common.cc b/source/rotate_common.cc new file mode 100644 index 00000000..e72608e9 --- /dev/null +++ b/source/rotate_common.cc @@ -0,0 +1,198 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; 
+ dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)]; + dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1]; + } + } +} + +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 
+ dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + int i; + for (i = 0; i < width; i += 4) { + uint32_t p00 = ((uint32_t*)(src))[0]; + uint32_t p10 = ((uint32_t*)(src))[1]; + uint32_t p20 = ((uint32_t*)(src))[2]; + uint32_t p30 = ((uint32_t*)(src))[3]; + uint32_t p01 = ((uint32_t*)(src1))[0]; + uint32_t p11 = ((uint32_t*)(src1))[1]; + uint32_t p21 = ((uint32_t*)(src1))[2]; + uint32_t p31 = ((uint32_t*)(src1))[3]; + uint32_t p02 = ((uint32_t*)(src2))[0]; + uint32_t p12 = ((uint32_t*)(src2))[1]; + uint32_t p22 = ((uint32_t*)(src2))[2]; + uint32_t p32 = ((uint32_t*)(src2))[3]; + uint32_t p03 = ((uint32_t*)(src3))[0]; + uint32_t p13 = ((uint32_t*)(src3))[1]; + uint32_t p23 = ((uint32_t*)(src3))[2]; + uint32_t p33 = ((uint32_t*)(src3))[3]; + ((uint32_t*)(dst))[0] = p00; + ((uint32_t*)(dst))[1] = p01; + ((uint32_t*)(dst))[2] = p02; + ((uint32_t*)(dst))[3] = p03; + ((uint32_t*)(dst1))[0] = p10; + ((uint32_t*)(dst1))[1] = p11; + ((uint32_t*)(dst1))[2] = p12; + ((uint32_t*)(dst1))[3] = p13; + ((uint32_t*)(dst2))[0] = p20; + ((uint32_t*)(dst2))[1] = p21; + ((uint32_t*)(dst2))[2] = p22; + ((uint32_t*)(dst2))[3] = p23; + ((uint32_t*)(dst3))[0] = p30; + ((uint32_t*)(dst3))[1] = p31; + ((uint32_t*)(dst3))[2] = p32; + ((uint32_t*)(dst3))[3] = p33; + src += src_stride * 4; // advance 4 rows + src1 += src_stride * 4; + src2 += src_stride * 4; + src3 += src_stride * 4; + dst += 4 * 4; // advance 4 columns + dst1 += 4 * 4; + dst2 += 4 * 4; + dst3 += 4 * 4; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/rotate_gcc.cc b/source/rotate_gcc.cc index 1a3f8cbb..fd5eee05 100644 --- a/files/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src, "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_SSE2) +// 4 values, little endian view +// a b c d +// e f g h +// i j k l +// m n o p + +// transpose 2x2 +// a e b f from row 0, 
1 +// i m j n from row 2, 3 +// c g d h from row 0, 1 +// k o l p from row 2, 3 + +// transpose 4x4 +// a e i m from row 0, 1 +// b f j n from row 0, 1 +// c g k o from row 2, 3 +// d h l p from row 2, 3 + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "movdqu (%0),%%xmm0 \n" // a b c d + "movdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "movdqu (%0),%%xmm2 \n" // i j k l + "movdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "movdqa %%xmm0,%%xmm6 \n" + "movdqa %%xmm2,%%xmm7 \n" + "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 + "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 + "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 + "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "movdqa %%xmm4,%%xmm0 \n" + "movdqa %%xmm4,%%xmm1 \n" + "movdqa %%xmm6,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 + "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 + "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 + "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 + + "movdqu %%xmm0,(%1) \n" + "lea 16(%1,%4),%1 \n" // dst += stride + 16 + "movdqu %%xmm1,-16(%1) \n" + "movdqu %%xmm2,-16(%1,%4) \n" + "movdqu %%xmm3,-16(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_AVX2) + +// Transpose 32 bit values (ARGB) +void 
Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 2 blocks of 4x4. Read a column, write a row. + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // a b c d + "vmovdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vmovdqu (%0),%%xmm2 \n" // i j k l + "vmovdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l + "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1 + "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3 + "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1 + "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1 + "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1 + "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3 + "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3 + + "vmovdqu %%ymm0,(%1) \n" + "lea 32(%1,%4),%1 \n" // dst += stride + 32 + "vmovdqu %%ymm1,-32(%1) \n" + "vmovdqu %%ymm2,-32(%1,%4) \n" + "vmovdqu %%ymm3,-32(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_AVX2) + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/files/source/rotate_lsx.cc b/source/rotate_lsx.cc index 94a2b91c..94a2b91c 100644 --- 
a/files/source/rotate_lsx.cc +++ b/source/rotate_lsx.cc diff --git a/files/source/rotate_msa.cc b/source/rotate_msa.cc index 99bdca65..99bdca65 100644 --- a/files/source/rotate_msa.cc +++ b/source/rotate_msa.cc diff --git a/files/source/rotate_neon.cc b/source/rotate_neon.cc index 844df2bf..569a7318 100644 --- a/files/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -410,6 +410,46 @@ void TransposeUVWx8_NEON(const uint8_t* src, : "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" + "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" + "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" + "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n" + "subs %8, %8, #4 \n" // w -= 4 + "vst1.8 {q0}, [%4]! \n" + "vst1.8 {q1}, [%5]! \n" + "vst1.8 {q2}, [%6]! \n" + "vst1.8 {q3}, [%7]! 
\n" + "bgt 1b \n" + + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/rotate_neon64.cc b/source/rotate_neon64.cc index 43c15817..95047fa7 100644 --- a/files/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -423,18 +423,57 @@ void TransposeUVWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 - "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride_a), // %6 + "r"((ptrdiff_t)dst_stride_b), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int 
src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" + "subs %w8, %w8, #4 \n" // w -= 4 + "st1 {v0.4s}, [%4], 16 \n" + "st1 {v1.4s}, [%5], 16 \n" + "st1 {v2.4s}, [%6], 16 \n" + "st1 {v3.4s}, [%7], 16 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/rotate_win.cc b/source/rotate_win.cc index a78873f8..a78873f8 100644 --- a/files/source/rotate_win.cc +++ b/source/rotate_win.cc diff --git a/files/source/row_any.cc b/source/row_any.cc index 3781a9f2..e574543c 100644 --- a/files/source/row_any.cc +++ b/source/row_any.cc @@ -19,7 +19,7 @@ namespace libyuv { extern "C" { #endif -// memset for temp is meant to clear the source buffer (not dest) so that +// memset for vin is meant to clear the source buffer so that // SIMD that reads full multiple of 16 bytes will not trigger msan errors. // memset is not needed for production, as the garbage values are processed but // not used, although there may be edge cases for subsampling. 
@@ -35,20 +35,20 @@ extern "C" { void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_MERGEARGBROW_SSE2 @@ -68,25 +68,25 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 
192, a_buf + n, r); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I444ALPHATOARGBROW_SSSE3 @@ -113,6 +113,9 @@ ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) #ifdef HAS_I422ALPHATOARGBROW_MSA ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_LSX +ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15) +#endif #ifdef HAS_I422ALPHATOARGBROW_LASX ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) #endif @@ -123,21 +126,20 @@ ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, 
u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210ALPHATOARGBROW_SSSE3 @@ -190,20 +192,20 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2, #define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 4]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 4]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef 
HAS_MERGEAR64ROW_AVX2 @@ -237,22 +239,22 @@ ANY41PT(MergeARGB16To8Row_Any_NEON, #undef ANY41PT // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } // Merge functions. 
@@ -285,6 +287,9 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_LSX +ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOYUY2ROW_LASX ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) #endif @@ -294,6 +299,9 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_LSX +ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOUYVYROW_LASX ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #endif @@ -308,28 +316,27 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. // Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, 
DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 3]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ + vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422TOARGBROW_SSSE3 @@ -359,6 +366,9 @@ ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) #endif +#ifdef HAS_I444TORGB24ROW_SSSE3 +ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) +#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif @@ -374,6 +384,9 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif +#ifdef HAS_I444TORGB24ROW_AVX2 +ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31) +#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif @@ -383,6 +396,9 @@ ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #ifdef HAS_I422TORGB565ROW_AVX2 ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif +#ifdef HAS_I444TORGB24ROW_NEON 
+ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7) +#endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) @@ -401,6 +417,14 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_LSX +ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15) +ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15) +#endif #ifdef HAS_I422TOARGBROW_LASX ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) @@ -420,19 +444,19 @@ ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, 
y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210TOAR30ROW_SSSE3 @@ -477,19 +501,19 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 3]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 3]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_MERGEXR30ROW_AVX2 @@ -541,18 +565,19 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ 
if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Merge functions. @@ -560,7 +585,10 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX512BW +ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) #endif #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) @@ -611,18 +639,27 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_LSX +ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBMULTIPLYROW_LASX ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_LSX +ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBADDROW_LASX ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_LSX +ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif @@ 
-664,22 +701,53 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #endif #undef ANY21 +// Any 2 planes to 1 with stride +// width is measured in source pixels. 4 bytes contains 2 pixels +#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[32 * 2]); \ + SIMD_ALIGNED(uint8_t vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int awidth = (width + 1) / 2; \ + int r = awidth & MASK; \ + int n = awidth & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \ + } \ + memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \ + memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, 32, vout, MASK + 1); \ + memcpy(dst_uv + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_YUY2TONVUVROW_NEON +ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_SSE2 +ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_AVX2 +ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) +#endif + // Any 2 planes to 1 with yuvconstants #define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, 
MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Biplanar to RGB. @@ -758,21 +826,21 @@ ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) #undef ANY21C // Any 2 planes of 16 bit to 1 with yuvconstants -#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ - ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_P210TOAR30ROW_SSSE3 @@ -806,21 +874,22 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* 
dst_uv, int depth, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ } \ - memcpy(temp, src_u + n, r * BPP); \ - memcpy(temp + 16, src_v + n, r * BPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ - memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ + memcpy(vin, src_u + n, r * BPP); \ + memcpy(vin + 16, src_v + n, r * BPP); \ + ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \ + memcpy(dst_uv + n * 2, vout, r * BPP * 2); \ } #ifdef HAS_MERGEUVROW_16_AVX2 -ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) +ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7) #endif #ifdef HAS_MERGEUVROW_16_NEON ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) @@ -829,18 +898,19 @@ ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 1. 
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_COPYROW_AVX @@ -931,6 +1001,13 @@ ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) +ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7) +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) @@ -959,6 +1036,9 @@ ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif +#ifdef 
HAS_ABGRTOYJROW_AVX2 +ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYJROW_AVX2 ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -983,6 +1063,9 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_SSSE3 +ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_SSSE3 ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif @@ -992,12 +1075,18 @@ ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_LSX +ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYROW_LASX ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_NEON +ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_NEON ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif @@ -1007,9 +1096,21 @@ ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_LSX ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYJROW_LSX +ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_LSX +ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_LASX +ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_LASX ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYJROW_LASX +ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1019,6 +1120,9 @@ ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 
1, 15) #ifdef HAS_BGRATOYROW_LSX ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_LASX +ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif @@ -1028,6 +1132,9 @@ ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #ifdef HAS_ABGRTOYROW_LSX ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYROW_LASX +ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1037,6 +1144,9 @@ ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #ifdef HAS_RGBATOYROW_LSX ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_LASX +ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif @@ -1055,6 +1165,12 @@ ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #ifdef HAS_RGB24TOYROW_LSX ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_LASX ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif @@ -1079,6 +1195,12 @@ ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #ifdef HAS_RAWTOYROW_LASX ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif +#ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1115,12 +1237,18 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA 
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_LSX +ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOYROW_LASX ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_LSX +ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_UYVYTOYROW_LASX ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #endif @@ -1217,6 +1345,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_LSX +ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif @@ -1238,19 +1369,21 @@ ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. 
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(vout, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 @@ -1270,16 +1403,17 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. 
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #if defined(HAS_I400TOARGBROW_SSE2) @@ -1355,6 +1489,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) +ANY11P(ARGBToRGB565DitherRow_Any_LSX, + ARGBToRGB565DitherRow_LSX, + const uint32_t, + 4, + 2, + 7) +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) ANY11P(ARGBToRGB565DitherRow_Any_LASX, ARGBToRGB565DitherRow_LASX, @@ -1375,6 +1517,9 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_LSX +ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7) +#endif #ifdef HAS_ARGBSHUFFLEROW_LASX ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) #endif @@ -1384,17 +1529,17 @@ ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) // Any 1 to 1 with type #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ - SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ - memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[(MASK + 1) 
* SBPP]); \ + SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ - memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ - ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ - memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBTOAR64ROW_SSSE3 @@ -1450,17 +1595,17 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ - SIMD_ALIGNED(STYPE temp[32]); \ - SIMD_ALIGNED(DTYPE out[32]); \ - memset(temp, 0, 32 * SBPP); /* for msan */ \ + SIMD_ALIGNED(STYPE vin[32]); \ + SIMD_ALIGNED(DTYPE vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, scale, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, scale, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_CONVERT16TO8ROW_SSSE3 @@ -1537,17 +1682,17 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ - SIMD_ALIGNED(ST temp[32]); \ - SIMD_ALIGNED(T out[32]); \ - memset(temp, 0, SBPP * 32); /* for msan */ \ + SIMD_ALIGNED(ST vin[32]); \ + SIMD_ALIGNED(T vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, param, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 @@ -1588,20 +1733,22 @@ ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) #undef ANY11P16 // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, 
yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } + #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -1628,21 +1775,21 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) #define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -1682,21 +1829,21 @@ ANY11I(InterpolateRow_16_Any_NEON, #define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int scale, int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, 
scale, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_16TO8_NEON @@ -1721,18 +1868,19 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, #undef ANY11IS // Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r* BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr, r* BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } #ifdef HAS_MIRRORROW_AVX2 @@ -1747,6 +1895,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_LSX +ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31) +#endif #ifdef HAS_MIRRORROW_LASX 
ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #endif @@ -1762,6 +1913,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) #endif +#ifdef HAS_MIRRORUVROW_LSX +ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7) +#endif #ifdef HAS_MIRRORUVROW_LASX ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #endif @@ -1777,6 +1931,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_LSX +ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) +#endif #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif @@ -1791,15 +1948,14 @@ ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) // Any 1 plane. (memset) #define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8_t temp[64]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vout[64]); \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, v32, n); \ } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + ANY_SIMD(vout, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_SETROW_X86 @@ -1823,20 +1979,21 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) #undef ANY1 // Any 1 to 2. Outputs UV planes. 
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(vin, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \ } #ifdef HAS_SPLITUVROW_SSE2 @@ -1875,6 +2032,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_LSX +ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOUV422ROW_LASX ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) @@ -1885,17 +2047,18 @@ ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31) // Any 2 16 bit 
planes with parameter to 1 #define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ } \ - memcpy(temp, src_uv + n * 2, r * BPP * 2); \ - ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ - memcpy(dst_u + n, temp + 32, r * BPP); \ - memcpy(dst_v + n, temp + 48, r * BPP); \ + memcpy(vin, src_uv + n * 2, r * BPP * 2); \ + ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \ + memcpy(dst_u + n, vout, r * BPP); \ + memcpy(dst_v + n, vout + 16, r * BPP); \ } #ifdef HAS_SPLITUVROW_16_AVX2 @@ -1909,21 +2072,22 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 3. Outputs RGB planes. 
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[16 * 3]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ } #ifdef HAS_SPLITRGBROW_SSSE3 @@ -1946,23 +2110,23 @@ ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif // Any 1 to 4. Outputs ARGB planes. 
-#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, uint8_t* dst_a, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 8]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ - MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 4, r); \ - memcpy(dst_g + n, temp + 16 * 5, r); \ - memcpy(dst_b + n, temp + 16 * 6, r); \ - memcpy(dst_a + n, temp + 16 * 7, r); \ +#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, uint8_t* dst_a, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[16 * 4]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ + memcpy(dst_a + n, vout + 48, r); \ } #ifdef HAS_SPLITARGBROW_SSE2 @@ -1983,25 +2147,26 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_ptr + (n 
>> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ } #ifdef HAS_ARGBTOUVROW_AVX2 @@ -2013,9 +2178,17 @@ ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVJROW_AVX2 +ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_SSSE3 +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_SSSE3 +ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) @@ -2034,12 +2207,18 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 
0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_LSX +ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_LASX ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVJROW_NEON +ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2142,12 +2321,18 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_LSX +ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_YUY2TOUVROW_LASX ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_LSX +ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_LASX ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #endif @@ -2158,24 +2343,25 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { 
/* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ - memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ + ANY_SIMD(vin, 128, vout, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \ } #ifdef HAS_AYUVTOVUROW_NEON @@ -2184,42 +2370,53 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S -#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src, src_tile_stride, dst, n); \ - } \ - memcpy(temp, src + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ - memcpy(dst + n, temp + 16, r); \ +#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \ + SIMD_ALIGNED(T vin[16]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \ + ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \ + memcpy(dst + n, vout, r * BPP); \ } #ifdef HAS_DETILEROW_NEON -ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_DETILEROW_SSE2 -ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 
uint8_t, 1, 15) +#endif +#ifdef HAS_DETILEROW_16_NEON +ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_SSE2 +ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_AVX +ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #endif +// DetileSplitUVRow width is in bytes #define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[16]); \ + SIMD_ALIGNED(uint8_t vout[8 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ - memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ - memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \ + memcpy(dst_u + n / 2, vout, (r + 1) / 2); \ + memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \ } #ifdef HAS_DETILESPLITUVROW_NEON @@ -2229,6 +2426,33 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) #endif +#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \ + uint8_t* dst_yuy2, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \ 
+ n); \ + } \ + memcpy(vin, src_y + (n / 16) * src_y_tile_stride, r); \ + memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \ + ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \ + memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \ + } + +#ifdef HAS_DETILETOYUY2_NEON +ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) +#endif + +#ifdef HAS_DETILETOYUY2_SSE2 +ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_common.cc b/source/row_common.cc index 83442496..3afc4b4d 100644 --- a/files/source/row_common.cc +++ b/source/row_common.cc @@ -21,6 +21,12 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast<type>(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + // This macro controls YUV to RGB using unsigned math to extend range of // YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: // LIBYUV_UNLIMITED_DATA @@ -42,7 +48,6 @@ extern "C" { defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 #define LIBYUV_RGBTOU_TRUNCATE 1 -#define LIBYUV_ATTENUATE_DUP 1 #endif #if defined(LIBYUV_BIT_EXACT) #define LIBYUV_UNATTENUATE_DUP 1 @@ -182,12 +187,13 @@ void RGB565ToARGBRow_C(const uint8_t* src_rgb565, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 2) | (g >> 4); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + 
dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = 255u; dst_argb += 4; src_rgb565 += 2; @@ -199,13 +205,14 @@ void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - uint8_t a = src_argb1555[1] >> 7; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 3) | (g >> 2); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = -a; dst_argb += 4; src_argb1555 += 2; @@ -217,14 +224,14 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - uint8_t a = src_argb4444[1] >> 4; - dst_argb[0] = (b << 4) | b; - dst_argb[1] = (g << 4) | g; - dst_argb[2] = (r << 4) | r; - dst_argb[3] = (a << 4) | a; + uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f); + uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4); + uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f); + uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r); + dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a); dst_argb += 4; src_argb4444 += 2; } @@ -274,6 +281,54 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* 
dst_ab30, int width) { } } +void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + uint8_t a = src_argb[3]; + dst_abgr[0] = r; + dst_abgr[1] = g; + dst_abgr[2] = b; + dst_abgr[3] = a; + dst_abgr += 4; + src_argb += 4; + } +} + +void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + uint8_t a = src_argb[3]; + dst_bgra[0] = a; + dst_bgra[1] = r; + dst_bgra[2] = g; + dst_bgra[3] = b; + dst_bgra += 4; + src_argb += 4; + } +} + +void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + uint8_t a = src_argb[3]; + dst_rgba[0] = a; + dst_rgba[1] = b; + dst_rgba[2] = g; + dst_rgba[3] = r; + dst_rgba += 4; + src_argb += 4; + } +} + void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { @@ -302,6 +357,22 @@ void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { } } +void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t a = src_rgba[0]; + uint8_t b = src_rgba[1]; + uint8_t g = src_rgba[2]; + uint8_t r = src_rgba[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_rgba += 4; + } +} + void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -320,7 +391,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t b0 = src_argb[0] >> 3; uint8_t g0 = src_argb[1] >> 2; uint8_t r0 = src_argb[2] >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 
11); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -334,29 +405,31 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { // or the upper byte for big endian. void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3); + uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2); + uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = 
STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -371,8 +444,10 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 3; uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15)); dst_rgb += 4; src_argb += 8; } @@ -381,7 +456,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 3; uint8_t r0 = src_argb[2] >> 3; uint8_t a0 = src_argb[3] >> 7; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); } } @@ -396,8 +472,10 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12)); dst_rgb += 4; src_argb += 8; } @@ -406,18 +484,20 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 4; uint8_t r0 = src_argb[2] >> 4; uint8_t a0 = src_argb[3] >> 4; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + *(uint16_t*)(dst_rgb) = + 
STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); } } void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); - uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); uint32_t a0 = (src_abgr[3] >> 6); - *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_abgr += 4; } @@ -430,7 +510,8 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); uint32_t a0 = (src_argb[3] >> 6); - *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_argb += 4; } @@ -439,10 +520,14 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { int x; for (x = 0; x < width; ++x) { - dst_ar64[0] = src_argb[0] * 0x0101; - dst_ar64[1] = src_argb[1] * 0x0101; - dst_ar64[2] = src_argb[2] * 0x0101; - dst_ar64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ar64[0] = b; + dst_ar64[1] = g; + dst_ar64[2] = r; + dst_ar64[3] = a; dst_ar64 += 4; src_argb += 4; } @@ -451,10 +536,14 @@ void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { void ARGBToAB64Row_C(const uint8_t* src_argb, 
uint16_t* dst_ab64, int width) { int x; for (x = 0; x < width; ++x) { - dst_ab64[0] = src_argb[2] * 0x0101; - dst_ab64[1] = src_argb[1] * 0x0101; - dst_ab64[2] = src_argb[0] * 0x0101; - dst_ab64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ab64[0] = r; + dst_ab64[1] = g; + dst_ab64[2] = b; + dst_ab64[3] = a; dst_ab64 += 4; src_argb += 4; } @@ -463,10 +552,14 @@ void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ar64[0] >> 8; - dst_argb[1] = src_ar64[1] >> 8; - dst_argb[2] = src_ar64[2] >> 8; - dst_argb[3] = src_ar64[3] >> 8; + uint8_t b = src_ar64[0] >> 8; + uint8_t g = src_ar64[1] >> 8; + uint8_t r = src_ar64[2] >> 8; + uint8_t a = src_ar64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ar64 += 4; } @@ -475,15 +568,35 @@ void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ab64[2] >> 8; - dst_argb[1] = src_ab64[1] >> 8; - dst_argb[2] = src_ab64[0] >> 8; - dst_argb[3] = src_ab64[3] >> 8; + uint8_t r = src_ab64[0] >> 8; + uint8_t g = src_ab64[1] >> 8; + uint8_t b = src_ab64[2] >> 8; + uint8_t a = src_ab64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ab64 += 4; } } +void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + uint16_t b = src_ar64[0]; + uint16_t g = src_ar64[1]; + uint16_t r = src_ar64[2]; + uint16_t a = src_ar64[3]; + dst_ab64[0] = r; + dst_ab64[1] = g; + dst_ab64[2] = b; + dst_ab64[3] = a; + dst_ab64 += 4; + src_ar64 += 
4; + } +} + // TODO(fbarchard): Make shuffle compatible with SIMD versions void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, @@ -514,8 +627,8 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return ((33 * r + 65 * g + 13 * b) >> 7) + 16; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); } #else // 8 bit @@ -524,8 +637,8 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + // 0x7e80) >> 8; -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); } #endif @@ -533,29 +646,31 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. 
#ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); } #else // TODO(fbarchard): Add rounding to x86 SIMD and use this -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); } #endif // LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. #if !defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); } static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); } #endif @@ -674,28 +789,28 @@ MAKEROWY(RAW, 0, 1, 2, 3) #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. 
-static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } #else // 8 bit -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (77 * r + 150 * g + 29 * b + 128) >> 8; } #endif #if defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #else -static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; } -static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; } #endif @@ -782,6 +897,7 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { #endif MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGB24, 2, 1, 0, 3) MAKEROWYJ(RAW, 0, 1, 2, 3) @@ -791,11 +907,12 @@ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); uint8_t r = src_rgb565[1] >> 3; - b = (b << 3) | (b >> 2); - g = (g << 2) | (g >> 4); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 
4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_rgb565 += 2; dst_y += 1; @@ -806,11 +923,12 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_argb1555 += 2; dst_y += 1; @@ -823,9 +941,9 @@ void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { uint8_t b = src_argb4444[0] & 0x0f; uint8_t g = src_argb4444[0] >> 4; uint8_t r = src_argb4444[1] & 0x0f; - b = (b << 4) | b; - g = (g << 4) | g; - r = (r << 4) | r; + b = STATIC_CAST(uint8_t, (b << 4) | b); + g = STATIC_CAST(uint8_t, (g << 4) | g); + r = STATIC_CAST(uint8_t, (r << 4) | r); dst_y[0] = RGBToY(r, g, b); src_argb4444 += 2; dst_y += 1; @@ -840,31 +958,35 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b1 = src_rgb565[2] & 0x1f; - uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8_t r1 = src_rgb565[3] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b3 = next_rgb565[2] & 0x1f; - uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8_t r3 = next_rgb565[3] >> 3; - - b0 = (b0 << 
3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 2) | (g1 >> 4); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 2) | (g3 >> 4); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -886,19 +1008,20 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_v += 1; } if (width & 1) { - 
uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -924,31 +1047,35 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b1 = src_argb1555[2] & 0x1f; - uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8_t b3 = next_argb1555[2] & 0x1f; - uint8_t g3 = (next_argb1555[2] >> 5) | 
((next_argb1555[3] & 0x03) << 3); - uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 3) | (g1 >> 2); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 3) | (g3 >> 2); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if 
LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -970,19 +1097,21 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_v += 1; } if (width & 1) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1021,18 +1150,18 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b1 = (b1 << 4) | b1; - g1 = (g1 << 4) | g1; - r1 = (r1 << 4) | r1; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - b3 = (b3 << 4) | b3; - g3 = (g3 << 4) | g3; - r3 = (r3 << 4) | r3; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = 
STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1); + g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1); + r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); + b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3); + g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3); + r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -1061,12 +1190,12 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1123,9 +1252,9 @@ void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int sg = (b * 22 + g * 88 + r * 45) >> 7; int sr = (b * 24 + g * 98 + r * 50) >> 7; // b does not over flow. a is preserved from original. 
- dst_argb[0] = sb; - dst_argb[1] = clamp255(sg); - dst_argb[2] = clamp255(sr); + dst_argb[0] = STATIC_CAST(uint8_t, sb); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr)); dst_argb += 4; } } @@ -1154,10 +1283,10 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; - dst_argb[0] = Clamp(sb); - dst_argb[1] = Clamp(sg); - dst_argb[2] = Clamp(sr); - dst_argb[3] = Clamp(sa); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb)); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr)); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa)); src_argb += 4; dst_argb += 4; } @@ -1207,9 +1336,12 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb, int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; - dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; - dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; - dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb[0] = STATIC_CAST( + uint8_t, (b * scale >> 16) * interval_size + interval_offset); + dst_argb[1] = STATIC_CAST( + uint8_t, (g * scale >> 16) * interval_size + interval_offset); + dst_argb[2] = STATIC_CAST( + uint8_t, (r * scale >> 16) * interval_size + interval_offset); dst_argb += 4; } } @@ -1260,10 +1392,10 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; const uint32_t a_scale = src_argb1[3]; - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_scale)); src_argb += 4; src_argb1 += 4; dst_argb += 4; 
@@ -1288,10 +1420,10 @@ void ARGBAddRow_C(const uint8_t* src_argb, const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; const int a_add = src_argb1[3]; - dst_argb[0] = SHADE(b, b_add); - dst_argb[1] = SHADE(g, g_add); - dst_argb[2] = SHADE(r, r_add); - dst_argb[3] = SHADE(a, a_add); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1315,10 +1447,10 @@ void ARGBSubtractRow_C(const uint8_t* src_argb, const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; const int a_sub = src_argb1[3]; - dst_argb[0] = SHADE(b, b_sub); - dst_argb[1] = SHADE(g, g_sub); - dst_argb[2] = SHADE(r, r_sub); - dst_argb[3] = SHADE(a, a_sub); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1431,7 +1563,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // clang-format off -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) // Bias values include subtract 128 from U and V, bias from Y and rounding. // For B and R bias is negative. For G bias is positive. 
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ @@ -1627,7 +1759,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef MAKEYUVCONSTANTS -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) #define LOAD_YUV_CONSTANTS \ int ub = yuvconstants->kUVCoeff[0]; \ int vr = yuvconstants->kUVCoeff[1]; \ @@ -1675,9 +1807,9 @@ static __inline void YuvPixel(uint8_t y, LOAD_YUV_CONSTANTS; uint32_t y32 = y * 0x0101; CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // Reads 8 bit YUV and leaves result as 16 bit. @@ -1706,9 +1838,9 @@ static __inline void YuvPixel10_16(uint16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 6; - u = clamp255(u >> 2); - v = clamp255(v >> 2); + uint32_t y32 = (y << 6) | (y >> 4); + u = STATIC_CAST(uint8_t, clamp255(u >> 2)); + v = STATIC_CAST(uint8_t, clamp255(v >> 2)); CALC_RGB16; *b = b16; *g = g16; @@ -1725,9 +1857,9 @@ static __inline void YuvPixel12_16(int16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 4; - u = clamp255(u >> 4); - v = clamp255(v >> 4); + uint32_t y32 = (y << 4) | (y >> 8); + u = STATIC_CAST(uint8_t, clamp255(u >> 4)); + v = STATIC_CAST(uint8_t, clamp255(v >> 4)); CALC_RGB16; *b = b16; *g = g16; @@ -1747,9 +1879,9 @@ static __inline void YuvPixel10(uint16_t y, int g16; int r16; YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 12 bit assembly. 
@@ -1765,9 +1897,9 @@ static __inline void YuvPixel12(uint16_t y, int g16; int r16; YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 16 bit assembly. @@ -1781,12 +1913,12 @@ static __inline void YuvPixel16_8(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // C reference code that mimics the YUV 16 bit assembly. 
@@ -1800,8 +1932,8 @@ static __inline void YuvPixel16_16(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; *b = b16; *g = g16; @@ -1815,7 +1947,7 @@ static __inline void YPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) int yg = yuvconstants->kRGBCoeffBias[0]; int ygb = yuvconstants->kRGBCoeffBias[4]; #else @@ -1823,9 +1955,10 @@ static __inline void YPixel(uint8_t y, int yg = yuvconstants->kYToRgb[0]; #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp(((int32_t)(y1) + ygb) >> 6); - *g = Clamp(((int32_t)(y1) + ygb) >> 6); - *r = Clamp(((int32_t)(y1) + ygb) >> 6); + uint8_t b8 = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *b = b8; + *g = b8; + *r = b8; } void I444ToARGBRow_C(const uint8_t* src_y, @@ -1846,6 +1979,23 @@ void I444ToARGBRow_C(const uint8_t* src_y, } } +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 3; // Advance 1 pixel. 
+ } +} + // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -1929,10 +2079,10 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width - 1; x += 2) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = clamp255(src_a[1] >> 2); + rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2)); src_y += 2; src_u += 1; src_v += 1; @@ -1942,7 +2092,7 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, if (width & 1) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); } } @@ -1957,7 +2107,7 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width; ++x) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); src_y += 1; src_u += 1; src_v += 1; @@ -2283,8 +2433,10 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; - *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000); + *(uint16_t*)(dst_argb4444 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000); src_y += 2; src_u += 1; src_v += 1; @@ -2295,7 +2447,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 
<< 8) | 0xf000); } } @@ -2321,8 +2474,10 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; - *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); + *(uint16_t*)(dst_argb1555 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000); src_y += 2; src_u += 1; src_v += 1; @@ -2333,7 +2488,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); } } @@ -2359,8 +2515,10 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb565 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); src_y += 2; src_u += 1; src_v += 1; @@ -2371,7 +2529,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -2486,8 +2645,12 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); + *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) | + STATIC_CAST(uint16_t, g1 << 5) | + STATIC_CAST(uint16_t, r1 << 11); 
src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -2497,7 +2660,9 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); } } @@ -2603,6 +2768,19 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { int x; src_uv += (width - 1) << 1; @@ -2714,6 +2892,21 @@ void DetileRow_C(const uint8_t* src, } } +void DetileRow_16_C(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16 * sizeof(uint16_t)); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, (width & 15) * sizeof(uint16_t)); + } +} + void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -2731,6 +2924,51 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, } } +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + for (int x = 0; x < width - 15; x += 16) { + for (int i = 0; i < 8; i++) { + dst_yuy2[0] = src_y[0]; + dst_yuy2[1] = src_uv[0]; + dst_yuy2[2] = src_y[1]; + dst_yuy2[3] = src_uv[1]; + dst_yuy2 += 4; + src_y += 2; + src_uv += 2; + } + src_y += src_y_tile_stride - 16; + src_uv += src_uv_tile_stride - 16; + } +} + +// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded +// in 80 byte blocks representing 64 pixels each. 
The first 16 bytes of the +// block contain all of the lower 2 bits of each pixel packed together, and the +// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are +// packed into 1x4 blocks, whereas the upper bits are packed in normal raster +// order. +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) { + for (size_t i = 0; i < size; i += 80) { + const uint8_t* src_lower_bits = src; + const uint8_t* src_upper_bits = src + 16; + + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 16; k++) { + *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 | + (uint16_t)*src_upper_bits << 8 | + (uint16_t)*src_upper_bits >> 2; + src_upper_bits++; + } + } + + src += 80; + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -2823,10 +3061,10 @@ void MergeAR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); + dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift); dst_ar64 += 4; } } @@ -2843,10 +3081,10 @@ void MergeARGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); + dst_argb[3] = 
STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift)); dst_argb += 4; } } @@ -2863,9 +3101,9 @@ void MergeXR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); dst_ar64[3] = 0xffff; dst_ar64 += 4; } @@ -2882,9 +3120,9 @@ void MergeXRGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); dst_argb[3] = 0xff; dst_argb += 4; } @@ -2930,8 +3168,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, assert(depth <= 16); int x; for (x = 0; x < width; ++x) { - dst_uv[0] = src_u[x] << shift; - dst_uv[1] = src_v[x] << shift; + dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift); + dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift); dst_uv += 2; } } @@ -2959,7 +3197,7 @@ void MultiplyRow_16_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - dst_y[x] = src_y[x] * scale; + dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale); } } @@ -2990,7 +3228,7 @@ void Convert16To8Row_C(const uint16_t* src_y, assert(scale <= 32768); for (x = 0; x < width; ++x) { - dst_y[x] = C16TO8(src_y[x], scale); + dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale)); } } @@ -3043,6 +3281,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, } } +// Filter 2 rows of YUY2 UV's (422) into UV 
(NV12). +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_uv += 2; + } +} + // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, @@ -3138,9 +3391,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; fb = src_argb[4 + 0]; @@ -3150,9 +3403,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; - dst_argb[4 + 0] = BLEND(fb, bb, a); - dst_argb[4 + 1] = BLEND(fg, bg, a); - dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[4 + 3] = 255u; src_argb += 8; src_argb1 += 8; @@ -3167,9 +3420,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; } } @@ -3196,12 +3449,7 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND -#if 
LIBYUV_ATTENUATE_DUP -// This code mimics the SSSE3 version for better testability. -#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#else -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#endif +#define ATTENUATE(f, a) (f * a + 255) >> 8 // Multiply source RGB by alpha and store to destination. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -3214,7 +3462,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); b = src_argb[4]; g = src_argb[5]; r = src_argb[6]; @@ -3222,7 +3470,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[4] = ATTENUATE(b, a); dst_argb[5] = ATTENUATE(g, a); dst_argb[6] = ATTENUATE(r, a); - dst_argb[7] = a; + dst_argb[7] = STATIC_CAST(uint8_t, a); src_argb += 8; dst_argb += 8; } @@ -3235,7 +3483,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); } } #undef ATTENUATE @@ -3307,10 +3555,10 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb, const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point // Clamping should not be necessary but is free in assembly. 
- dst_argb[0] = UNATTENUATE(b, ia); - dst_argb[1] = UNATTENUATE(g, ia); - dst_argb[2] = UNATTENUATE(r, ia); - dst_argb[3] = a; + dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia)); + dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia)); + dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia)); + dst_argb[3] = STATIC_CAST(uint8_t, a); src_argb += 4; dst_argb += 4; } @@ -3344,12 +3592,20 @@ void CumulativeSumToAverageRow_C(const int32_t* tl, int i; assert(area != 0); - ooa = 1.0f / area; + ooa = 1.0f / STATIC_CAST(float, area); for (i = 0; i < count; ++i) { - dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = + (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * + ooa); + dst[1] = + (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * + ooa); + dst[2] = + (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * + ooa); + dst[3] = + (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * + ooa); dst += 4; tl += 4; bl += 4; @@ -3407,7 +3663,9 @@ static void HalfRow_16To8_C(const uint16_t* src_uv, int width) { int x; for (x = 0; x < width; ++x) { - dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale); + dst_uv[x] = STATIC_CAST( + uint8_t, + C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale)); } } @@ -3433,8 +3691,9 @@ void InterpolateRow_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint8_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3463,8 +3722,9 @@ void InterpolateRow_16_C(uint16_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = 
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint16_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3501,9 +3761,11 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = C16TO8( - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, - scale); + dst_ptr[0] = STATIC_CAST( + uint8_t, + C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale)); src_ptr += 1; src_ptr1 += 1; dst_ptr += 1; @@ -3615,10 +3877,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32_t)(db)); - dst_argb[1] = Clamp((int32_t)(dg)); - dst_argb[2] = Clamp((int32_t)(dr)); - dst_argb[3] = Clamp((int32_t)(da)); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db))); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg))); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp((int32_t)(dr))); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da))); src_argb += 4; dst_argb += 4; } @@ -4023,6 +4285,32 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif +#if defined(HAS_I444TORGB24ROW_AVX2) +void I444ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth; + src_v += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_NV12TORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, @@ -4164,8 +4452,9 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) { void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { int i; for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + *dst++ = STATIC_CAST( + uint16_t, + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8); ++src; } } @@ -4325,6 +4614,8 @@ void HalfMergeUVRow_C(const uint8_t* src_u, } } +#undef STATIC_CAST + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_gcc.cc b/source/row_gcc.cc index dce8c439..d8074987 100644 --- a/files/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -27,6 +27,9 @@ static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; +static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u, + 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; + static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -39,12 +42,18 @@ static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}; +static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, + -43, -84, 127, 0, -43, -84, 127, 0}; + static const vec8 kARGBToV = {-18, 
-94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}; static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0}; +static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, + 127, -107, -20, 0, 127, -107, -20, 0}; + // Constants for BGRA static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; @@ -729,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" @@ -777,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" @@ -1201,6 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 @@ -1228,6 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 @@ -1256,6 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1284,6 +1296,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1398,6 +1411,24 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ABGRTOYJROW_SSSE3 +// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. 
+// Same as ABGRToYRow but different coefficients, no add 16. +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYJROW_SSSE3 + #ifdef HAS_RGBATOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. @@ -1416,7 +1447,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_SSSE3 -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif @@ -1429,9 +1461,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1451,9 +1482,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1472,9 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - 
LABELALIGN RGBTOY_AVX2(ymm5) + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1486,15 +1515,32 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 +#ifdef HAS_ABGRTOYJROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYJROW_AVX2 + #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2( + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1571,11 +1617,15 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVROW_SSSE3 -#ifdef HAS_ARGBTOUVROW_AVX2 +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) // vpshufb for vphaddw + vpackuswb packed to shorts. 
static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1765,6 +1815,71 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 +// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix +#ifdef HAS_ABGRTOUVJROW_AVX2 +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + 
"+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kSub128), // %5 + "m"(kABGRToVJ), // %6 + "m"(kABGRToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1831,6 +1946,72 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ABGRTOUVJROW_SSSE3 +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : 
"r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToVJ), // %5 + "m"(kABGRToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + #ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, @@ -2153,9 +2334,6 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422 10 bit, upsample to 8 UV -// TODO(fbarchard): Consider shufb to replace pack/unpack -// TODO(fbarchard): Consider pmulhuw to replace psraw -// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. #define READYUV210 \ "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ @@ -2165,7 +2343,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" #define READYUVA210 \ @@ -2177,7 +2358,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2196,7 +2380,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 444 10 bit. With 8 Alpha. 
@@ -2211,7 +2398,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2228,7 +2418,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x4,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $4,%%xmm4 \n" \ + "psrlw $8,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. @@ -2399,6 +2592,20 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" +// Store 8 RGB24 values. +#define STORERGB24 \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "pshufb %%xmm5,%%xmm0 \n" \ + "pshufb %%xmm6,%%xmm1 \n" \ + "palignr $0xc,%%xmm0,%%xmm1 \n" \ + "movq %%xmm0,(%[dst_rgb24]) \n" \ + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \ + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + // Store 8 AR30 values. 
#define STOREAR30 \ "psraw $0x4,%%xmm0 \n" \ @@ -2508,17 +2715,43 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + STORERGB24 + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STORERGB24 "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -3209,7 +3442,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 210, upsample to 
16 UV. With 16 Alpha. @@ -3224,7 +3459,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -3242,7 +3479,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 212 12 bit, upsample to 16 UV @@ -3257,7 +3496,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ + "vpsllw $4,%%ymm4,%%ymm2 \n" \ + "vpsrlw $8,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 16 UV from 410. With 16 Alpha. 
@@ -3271,7 +3512,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -4785,6 +5028,84 @@ void DetileRow_SSE2(const uint8_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILEROW_16_SSE2 +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILEROW_16_AVX +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea (%0,%3,2),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_AVX + +#ifdef HAS_DETILETOYUY2_SSE2 +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" + "lea (%0,%4),%0 \n" + "movdqu (%1),%%xmm1 \n" // Load 8 UV + "lea (%1,%5),%1 \n" + "movdqu %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif + #ifdef HAS_DETILESPLITUVROW_SSSE3 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit @@ -4821,36 +5142,59 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, } #endif // HAS_DETILESPLITUVROW_SSSE3 +#ifdef HAS_MERGEUVROW_AVX512BW +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX512BW + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( - - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw 
%%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -4859,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile("sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -4876,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -4891,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - depth = 16 - depth; // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" + "vmovd %5,%%xmm4 \n" + + "sub %0,%1 \n" + // 8 pixels per loop. - // 16 pixels per loop. 
- LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - + "vpmovzxwd (%0),%%ymm0 \n" + "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" + "vpslld %%xmm4,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(depth) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 @@ -5127,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, // 512 = 9 bits // 1024 = 10 bits // 4096 = 12 bits -// TODO(fbarchard): reduce to SSE2 void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, @@ -6178,6 +6517,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6461,6 +6801,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 
0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, @@ -6661,6 +7028,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1"); +} + void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, @@ -7045,93 +7441,106 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, + -128, -128, 14, -128, 14, -128, + 14, -128, -128, -128}; + // Attenuate 4 pixels at a time. 
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "punpcklbw %%xmm6,%%xmm7 \n" + "sub %0,%1 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" + "movdqu (%0),%%xmm6 \n" + "movdqa %%xmm6,%%xmm0 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0 + "pshufb %%xmm4,%%xmm3 \n" + "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha + "pmullw %%xmm3,%%xmm1 \n" + "paddw %%xmm7,%%xmm0 \n" // + 255 + "paddw %%xmm7,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "pand %%xmm5,%%xmm6 \n" + "por %%xmm6,%%xmm0 \n" + "movdqu %%xmm0,(%0,%1) \n" + "lea 0x10(%0),%0 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 + // Shuffle table duplicating alpha. 
-static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; +static const lvec8 kAttenuateShuffle_AVX2 = { + 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14, + -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128, + -128, -128, 30, -128, 30, -128, 30, -128, -128, -128}; + // Attenuate 8 pixels at a time. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n" "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpmullw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm6,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%0,%1,1) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_AVX2 diff --git 
a/files/source/row_lasx.cc b/source/row_lasx.cc index 7dd18f40..be85022e 100644 --- a/files/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -543,8 +543,8 @@ void I422ToARGB4444Row_LASX(const uint8_t* src_y, __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000, - 0xF000F000F000F000}; + __m256i alpha = (__m256i)v4u64{0xF000F000F000F000, 0xF000F000F000F000, + 0xF000F000F000F000, 0xF000F000F000F000}; __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0}; @@ -595,8 +595,8 @@ void I422ToARGB1555Row_LASX(const uint8_t* src_y, __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000, - 0x8000800080008000}; + __m256i alpha = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000, + 0x8000800080008000, 0x8000800080008000}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); @@ -775,40 +775,6 @@ void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, } } -void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3; - __m256i tmp0, tmp1, dst0; - __m256i const_19 = __lasx_xvldi(0x19); - __m256i const_42 = __lasx_xvldi(0x42); - __m256i const_81 = __lasx_xvldi(0x81); - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, - src_argb0, 96, src0, src1, src2, src3); - vec0 = __lasx_xvpickev_b(src1, src0); - vec1 = 
__lasx_xvpickev_b(src3, src2); - vec2 = __lasx_xvpickod_b(src1, src0); - vec3 = __lasx_xvpickod_b(src3, src2); - tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19); - tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19); - tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81); - tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81); - tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42); - tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42); - dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8); - dst0 = __lasx_xvperm_w(dst0, control); - __lasx_xvst(dst0, dst_y, 0); - src_argb0 += 128; - dst_y += 32; - } -} - void ARGBToUVRow_LASX(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, @@ -833,8 +799,8 @@ void ARGBToUVRow_LASX(const uint8_t* src_argb0, 0x0009000900090009, 0x0009000900090009}; __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}; - __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, @@ -1071,8 +1037,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb, __m256i const_38 = __lasx_xvldi(38); __m256i const_94 = __lasx_xvldi(94); __m256i const_18 = __lasx_xvldi(18); - __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}; for (x = 0; x < len; x++) { @@ -1216,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = 
width / 16; @@ -1643,8 +1609,8 @@ void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, __m256i const_38 = __lasx_xvldi(0x413); __m256i const_94 = __lasx_xvldi(0x42F); __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0, @@ -1760,8 +1726,8 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, __m256i const_38 = __lasx_xvldi(0x413); __m256i const_94 = __lasx_xvldi(0x42F); __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0, @@ -1811,48 +1777,6 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x4219421942194219, 0x4219421942194219, - 0x4219421942194219, 0x4219421942194219}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 
0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_rgb24, 0); - reg1 = __lasx_xvld(src_rgb24, 32); - reg2 = __lasx_xvld(src_rgb24, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_rgb24 += 96; - } -} - void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -1869,8 +1793,8 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, __m256i const_38 = __lasx_xvldi(0x413); __m256i const_94 = __lasx_xvldi(0x42F); __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18, 0x15120F0C09060300, 0x00000000001E1B18}; __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908, @@ -1916,48 +1840,6 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, } } -void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x1942194219421942, 0x1942194219421942, - 0x1942194219421942, 0x1942194219421942}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 
0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_raw, 0); - reg1 = __lasx_xvld(src_raw, 32); - reg2 = __lasx_xvld(src_raw, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_raw += 96; - } -} - void RAWToUVRow_LASX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -1974,8 +1856,8 @@ void RAWToUVRow_LASX(const uint8_t* src_raw, __m256i const_38 = __lasx_xvldi(0x413); __m256i const_94 = __lasx_xvldi(0x42F); __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18, 0x15120F0C09060300, 0x00000000001E1B18}; __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908, @@ -2118,36 +2000,228 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -void 
ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, dst0; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1; - __m256i const_128 = __lasx_xvldi(0x480); - __m256i const_150 = __lasx_xvldi(0x96); - __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, - 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
+static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp2 = __lasx_xvpickev_b(src3, src2); - tmp3 = __lasx_xvpickod_b(src3, src2); - reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); - dst0 = 
__lasx_xvpickod_b(reg1, reg0); - dst0 = __lasx_xvperm_w(dst0, shuff); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_argb += 128; - } +void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G + "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h 
$xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr4, %4, 0 \n\t" // load shuff + "xvld $xr5, %4, 32 \n\t" + "xvld $xr6, %4, 64 \n\t" + "xvld $xr7, %4, 96 \n\t" + "1: \n\t" + "xvld $xr8, %0, 0 \n\t" + "xvld $xr9, %0, 32 \n\t" + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "xvor.v $xr11, $xr9, $xr9 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 + "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" + "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" + "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" + "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" + "addi.d %0, %0, 96 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvst $xr10, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); } void ARGBToUVJRow_LASX(const uint8_t* src_argb, @@ -2168,8 +2242,8 @@ void 
ARGBToUVJRow_LASX(const uint8_t* src_argb, __m256i const_21 = __lasx_xvldi(0x415); __m256i const_53 = __lasx_xvldi(0x435); __m256i const_10 = __lasx_xvldi(0x40A); - __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; + __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301, 0x1F1D0F0D1B190B09}; diff --git a/files/source/row_lsx.cc b/source/row_lsx.cc index 3e8b901a..fa088c9e 100644 --- a/files/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -31,6 +31,91 @@ extern "C" { yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = __lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = __lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. 
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ + g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + 
DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + // Convert 8 pixels of YUV420 to RGB. #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ { \ @@ -118,6 +203,25 @@ extern "C" { out_g = __lsx_vpackev_h(tmp1, tmp0); \ } +// Pack and Store 16 ARGB values. +#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + // Pack and Store 8 ARGB values. 
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ { \ @@ -155,6 +259,1028 @@ extern "C" { _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ } +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 32; + __m128i src0, src1; + __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 8; + __m128i src, dst; + __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 8) << 1; + for (x = 0; x < len; x++) { + src = __lsx_vld(src_uv, 0); + dst = __lsx_vshuf_h(shuffler, src, src); + __lsx_vst(dst, dst_uv, 0); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 8; + __m128i src0, src1; + __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; + + src += (width * 4) - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_yuy2_0, vec_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); + __lsx_vst(vec_yuy2_0, 
dst_yuy2, 0); + __lsx_vst(vec_yuy2_1, dst_yuy2, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_yuy2 += 32; + } +} + +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_uyvy0, vec_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); + __lsx_vst(vec_uyvy0, dst_uyvy, 0); + __lsx_vst(vec_uyvy1, dst_uyvy, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_uyvy += 32; + } +} + +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + 
__m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + int res = width & 15; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i zero = __lsx_vldi(0); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lsx_vld(src_a, 0); + a_l = __lsx_vilvl_b(zero, y); + a_h = __lsx_vilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + src_a += 16; + } + if (res) { + __m128i y, uv, r, g, b, a; + a = __lsx_vld(src_a, 0); + a = __lsx_vsllwil_hu_bu(a, 0); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const 
struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614}; + __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m128i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lsx_vpackev_b(g_l, b_l); + temp1 = __lsx_vpackev_b(g_h, b_h); + DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l, + temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lsx_vilvl_d(temp1, temp2); + b_h = __lsx_vilvh_d(temp3, temp1); + __lsx_vst(temp0, dst_argb, 0); + __lsx_vst(b_l, dst_argb, 16); + __lsx_vst(b_h, dst_argb, 32); + dst_argb += 48; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
+void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 2); + g_h = __lsx_vsrli_h(g_h, 2); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 11); + r_h = __lsx_vslli_h(r_h, 11); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vor_v(r_l, g_l); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, g_h); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_rgb565, 0); + __lsx_vst(r_h, dst_rgb565, 16); + dst_rgb565 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000}; + __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 4); + b_h = __lsx_vsrli_h(b_h, 4); + r_l = __lsx_vsrli_h(r_l, 4); + r_h = __lsx_vsrli_h(r_h, 4); + g_l = __lsx_vand_v(g_l, mask); + g_h = __lsx_vand_v(g_h, mask); + r_l = __lsx_vslli_h(r_l, 8); + r_h = __lsx_vslli_h(r_h, 8); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb4444, 0); + __lsx_vst(r_h, dst_argb4444, 16); + dst_argb4444 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = 
__lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 3); + + g_h = __lsx_vsrli_h(g_h, 3); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 10); + r_h = __lsx_vslli_h(r_h, 10); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb1555, 0); + __lsx_vst(r_h, dst_argb1555, 16); + dst_argb1555 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + dst0 = __lsx_vpickev_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_yuy2 += 32; + dst_y += 16; + } +} + +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, + src_yuy2_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickod_b(src1, src0); + src1 = __lsx_vpickod_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 
+= 32; + src_yuy2_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + tmp0 = __lsx_vpickod_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + dst0 = __lsx_vpickod_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_uyvy += 32; + dst_y += 16; + } +} + +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, + src_uyvy_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickev_b(src1, src0); + src1 = __lsx_vpickev_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + src_uyvy_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + dst0 = 
__lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToUVRow_LSX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i vec0, vec1, vec2, vec3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; + __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; + __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; + __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; + __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; + __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, + 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1, + 48, src4, src5, src6, src7); + vec0 = __lsx_vaddwev_h_bu(src0, src4); + vec1 = __lsx_vaddwev_h_bu(src1, src5); + vec2 = __lsx_vaddwev_h_bu(src2, src6); + vec3 = __lsx_vaddwev_h_bu(src3, src7); + tmp0 = __lsx_vpickev_h(vec1, vec0); + tmp1 = __lsx_vpickev_h(vec3, vec2); + tmp2 = __lsx_vpickod_h(vec1, vec0); + tmp3 = __lsx_vpickod_h(vec3, vec2); + vec0 = __lsx_vaddwod_h_bu(src0, src4); + vec1 = __lsx_vaddwod_h_bu(src1, src5); + vec2 = __lsx_vaddwod_h_bu(src2, src6); + vec3 = __lsx_vaddwod_h_bu(src3, src7); + tmp4 = __lsx_vpickev_h(vec1, vec0); + tmp5 = __lsx_vpickev_h(vec3, vec2); + vec0 = __lsx_vpickev_h(tmp1, tmp0); + vec1 = __lsx_vpickod_h(tmp1, tmp0); + src0 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp3, tmp2); + vec1 = __lsx_vpickod_h(tmp3, tmp2); + src1 = __lsx_vavgr_h(vec0, vec1); + 
vec0 = __lsx_vpickev_h(tmp5, tmp4); + vec1 = __lsx_vpickod_h(tmp5, tmp4); + src2 = __lsx_vavgr_h(vec0, vec1); + dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); + dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); + dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); + dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); + dst0 = __lsx_vsrai_h(dst0, 8); + dst1 = __lsx_vsrai_h(dst1, 8); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + src_argb0 += 64; + src_argb1 += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i 
shuf = {0x090A040506000102, 0x000000000C0D0E08}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i shift = {0x0300030003000300, 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vpackev_b(zero, tmp1); + tmp1 = __lsx_vsrli_h(tmp1, 2); + tmp0 = __lsx_vsll_b(tmp0, shift); + tmp1 = __lsx_vslli_h(tmp1, 5); + dst0 = __lsx_vor_v(tmp0, tmp1); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; + __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; + + for (x = 0; x < len; x++) { + 
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vsrl_b(tmp1, shift1); + tmp0 = __lsx_vsll_b(tmp0, shift2); + tmp2 = __lsx_vpackev_b(zero, tmp1); + tmp3 = __lsx_vpackod_b(zero, tmp1); + tmp2 = __lsx_vslli_h(tmp2, 5); + tmp3 = __lsx_vslli_h(tmp3, 15); + dst0 = __lsx_vor_v(tmp0, tmp2); + dst0 = __lsx_vor_v(dst0, tmp3); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vandi_b(tmp1, 0xF0); + tmp0 = __lsx_vsrli_b(tmp0, 4); + dst0 = __lsx_vor_v(tmp1, tmp0); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, dst0, dst1; + __m128i const_112 = __lsx_vldi(112); + __m128i const_74 = __lsx_vldi(74); + __m128i const_38 = __lsx_vldi(38); + __m128i const_94 = __lsx_vldi(94); + __m128i const_18 = __lsx_vldi(18); + __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickev_h(src1, src0); + tmp1 = __lsx_vpickod_h(src1, src0); + tmp2 = __lsx_vpickev_h(src3, src2); + tmp3 = __lsx_vpickod_h(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = 
__lsx_vmulwod_h_bu(tmp0, const_74); + reg3 = __lsx_vmulwod_h_bu(tmp2, const_74); + reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst0 = __lsx_vpickev_b(reg1, reg0); + + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); + reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); + reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst1 = __lsx_vpickev_b(reg1, reg0); + + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_v, 0); + dst_u += 16; + dst_v += 16; + src_argb += 64; + } +} + +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp2 = __lsx_vilvl_b(zero, src1); + tmp3 = __lsx_vilvh_b(zero, src1); + dst0 = __lsx_vmuh_hu(tmp0, tmp2); + dst1 = __lsx_vmuh_hu(tmp1, tmp3); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vsadd_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); 
+ src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vssub_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i b, g, r, a, dst0, dst1; + __m128i control = {0x0005000100040000, 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(tmp0, tmp0); + r = __lsx_vpackod_b(tmp0, tmp0); + g = __lsx_vpackev_b(tmp1, tmp1); + a = __lsx_vpackod_b(tmp1, tmp1); + reg0 = __lsx_vmulwev_w_hu(b, a); + reg1 = __lsx_vmulwod_w_hu(b, a); + reg2 = __lsx_vmulwev_w_hu(r, a); + reg3 = __lsx_vmulwod_w_hu(r, a); + reg4 = __lsx_vmulwev_w_hu(g, a); + reg5 = __lsx_vmulwod_w_hu(g, a); + reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); + reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); + reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); + reg0 = __lsx_vshuf_h(control, reg0, reg0); + reg2 = __lsx_vshuf_h(control, reg2, reg2); + reg4 = __lsx_vshuf_h(control, reg4, reg4); + tmp0 = __lsx_vpackev_b(reg4, reg0); + tmp1 = __lsx_vpackev_b(a, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + src_argb += 32; + } +} + +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + 
__m128i b, g, r; + __m128i zero = __lsx_vldi(0); + __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); + + vec_dither = __lsx_vilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(zero, tmp0); + r = __lsx_vpackod_b(zero, tmp0); + g = __lsx_vpackev_b(zero, tmp1); + b = __lsx_vadd_h(b, vec_dither); + g = __lsx_vadd_h(g, vec_dither); + r = __lsx_vadd_h(r, vec_dither); + DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); + r = __lsx_vclip255_h(r); + b = __lsx_vsrai_h(b, 3); + g = __lsx_vsrai_h(g, 2); + r = __lsx_vsrai_h(r, 3); + g = __lsx_vslli_h(g, 5); + r = __lsx_vslli_h(r, 11); + dst0 = __lsx_vor_v(b, g); + dst0 = __lsx_vor_v(dst0, r); + __lsx_vst(dst0, dst_rgb, 0); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, dst0, dst1; + __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; + __m128i temp = __lsx_vldrepl_w(shuffler, 0); + + shuf = __lsx_vadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vshuf_b(src0, src0, shuf); + dst1 = __lsx_vshuf_b(src1, src1, shuf); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 4; + __m128i src0, dst0, tmp0, tmp1; + __m128i vec_value = __lsx_vreplgr2vr_w(value); + + vec_value = __lsx_vilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb, 0); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp0 = __lsx_vmuh_hu(tmp0, vec_value); + tmp1 = __lsx_vmuh_hu(tmp1, vec_value); + dst0 = __lsx_vpickod_b(tmp1, 
tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, dst0, dst1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + reg0 = __lsx_vdp2_h_bu(tmp0, const_br); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lsx_vadd_h(reg0, reg1); + tmp0 = __lsx_vpackod_b(reg2, reg2); + tmp1 = __lsx_vpackod_b(tmp1, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, spb, spg, spr; + __m128i dst0, dst1; + __m128i spb_g = __lsx_vldi(68); + __m128i spg_g = __lsx_vldi(88); + __m128i spr_g = __lsx_vldi(98); + __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; + __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16}; + __m128i spr_br = {0x3218321832183218, 0x3218321832183218}; + __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lsx_vdp2_h_bu(tmp0, spr_br); + spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lsx_vsrli_h(spb, 7); + spg = __lsx_vsrli_h(spg, 7); + spr = 
__lsx_vsrli_h(spr, 7); + spg = __lsx_vsat_hu(spg, 7); + spr = __lsx_vsat_hu(spr, 7); + reg0 = __lsx_vpackev_b(spg, spb); + reg1 = __lsx_vshuf_b(tmp1, spr, shuff); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + } +} + void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { @@ -407,7 +1533,7 @@ void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0, @@ -516,7 +1642,7 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0, @@ -561,39 +1687,6 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - 
src0 = __lsx_vld(src_rgb24, 0); - src1 = __lsx_vld(src_rgb24, 16); - src2 = __lsx_vld(src_rgb24, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgb24 += 48; - } -} - void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -610,7 +1703,7 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18}; __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908}; __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; @@ -647,39 +1740,6 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, } } -void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_raw, 0); - src1 = __lsx_vld(src_raw, 16); - src2 = 
__lsx_vld(src_raw, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_raw += 48; - } -} - void RAWToUVRow_LSX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -696,7 +1756,7 @@ void RAWToUVRow_LSX(const uint8_t* src_raw, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18}; __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908}; __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; @@ -914,62 +1974,6 @@ void SobelXYRow_LSX(const uint8_t* src_sobelx, } } -void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_128 = __lsx_vldi(0x480); - __m128i const_150 = __lsx_vldi(0x96); - __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - 
reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_argb += 64; - } -} - -void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_bgra += 64; - } -} - void BGRAToUVRow_LSX(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -987,7 +1991,7 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, @@ -1018,34 +2022,6 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra, } } -void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br 
= {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_abgr += 64; - } -} - void ABGRToUVRow_LSX(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, @@ -1063,7 +2039,7 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, @@ -1094,34 +2070,6 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr, } } -void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = 
__lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgba += 64; - } -} - void RGBAToUVRow_LSX(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, @@ -1139,7 +2087,7 @@ void RGBAToUVRow_LSX(const uint8_t* src_rgba, __m128i const_38 = __lsx_vldi(0x413); __m128i const_94 = __lsx_vldi(0x42F); __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, @@ -1188,7 +2136,7 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb, __m128i const_21 = __lsx_vldi(0x415); __m128i const_53 = __lsx_vldi(0x435); __m128i const_10 = __lsx_vldi(0x40A); - __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, @@ -1566,7 +2514,7 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb, __m128i const_256 = __lsx_vldi(0x500); __m128i zero = __lsx_vldi(0); __m128i alpha = __lsx_vldi(0xFF); - __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16, @@ -1612,7 +2560,7 @@ void ARGBQuantizeRow_LSX(uint8_t* dst_argb, __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset); __m128i vec_scale = __lsx_vreplgr2vr_w(scale); __m128i zero = __lsx_vldi(0); - __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + __m128i control = 
(__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000}; for (x = 0; x < len; x++) { DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48, @@ -1821,6 +2769,216 @@ void HalfFloatRow_LSX(const uint16_t* src, } } +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // ARGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. 
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // RGBA + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G + "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 
21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "vld $vr4, %4, 0 \n\t" // load shuff + "vld $vr5, %4, 16 \n\t" + "vld $vr6, %4, 32 \n\t" + "vld $vr7, %4, 48 \n\t" + "1: \n\t" + "vld $vr8, %0, 0 \n\t" + "vld $vr9, %0, 16 \n\t" + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" + "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" + "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" + "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" + "addi.d %0, %0, 48 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) 
{ + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_msa.cc b/source/row_msa.cc index b7d5bb5e..b7d5bb5e 100644 --- a/files/source/row_msa.cc +++ b/source/row_msa.cc diff --git a/files/source/row_neon.cc b/source/row_neon.cc index 804ff839..31142a90 100644 --- a/files/source/row_neon.cc +++ b/source/row_neon.cc @@ -89,12 +89,14 @@ extern "C" { "vsli.u16 d2, d2, #8 \n" \ "vsri.u16 d3, d3, #8 \n" +// TODO: Use single register for kUVCoeff and multiply by lane #define YUVTORGB_SETUP \ + "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + "vdup.u16 q10, d31[1] \n" \ + "vdup.u16 q11, d31[2] \n" \ + "vdup.u16 q12, d31[3] \n" \ + "vdup.u16 d31, d31[0] \n" // q0: B uint16x8_t // q1: G uint16x8_t @@ -156,6 +158,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "d6"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -586,10 +611,10 @@ void DetileRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop - "pld [%0, 1792] \n" - "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "pld [%0, #1792] \n" + "vst1.8 {q0}, [%1]! \n" // store 16 bytes "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -599,6 +624,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. +void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, #3584] \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -609,7 +654,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" - "pld [%0, 1792] \n" + "pld [%0, #1792] \n" "vst1.8 {d0}, [%1]! \n" "vst1.8 {d1}, [%2]! 
\n" "bgt 1b \n" @@ -622,6 +667,101 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "pld [%0, #1792] \n" + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "pld [%1, #1792] \n" + "subs %3, %3, #16 \n" + "vst2.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "subs %3, %3, #16 \n" + "pld [%0, #1792] \n" + "vzip.8 q0, q1 \n" + "pld [%1, #1792] \n" + "vst1.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list + ); +} +#endif + +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "vld1.8 {q14}, [%0]! \n" // Load lower bits. + "vld1.8 {q9}, [%0]! \n" // Load upper bits row + // by row. + "vld1.8 {q11}, [%0]! \n" + "vld1.8 {q13}, [%0]! \n" + "vld1.8 {q15}, [%0]! \n" + "vshl.u8 q8, q14, #6 \n" // Shift lower bit data + // appropriately. + "vshl.u8 q10, q14, #4 \n" + "vshl.u8 q12, q14, #2 \n" + "vzip.u8 q8, q9 \n" // Interleave upper and + // lower bits. 
+ "vzip.u8 q10, q11 \n" + "vzip.u8 q12, q13 \n" + "vzip.u8 q14, q15 \n" + "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits + // into lower 6 bits for + // better accuracy in + // conversions. + "vsri.u16 q9, q9, #10 \n" + "vsri.u16 q10, q10, #10 \n" + "vsri.u16 q11, q11, #10 \n" + "vsri.u16 q12, q12, #10 \n" + "vsri.u16 q13, q13, #10 \n" + "vsri.u16 q14, q14, #10 \n" + "vsri.u16 q15, q15, #10 \n" + "vstmia %1!, {q8-q15} \n" // Store pixel block (64 + // pixels). + "subs %2, %2, #80 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -664,7 +804,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(dst_b), // %3 "+r"(width) // %4 : // Input registers - : "cc", "memory", "d0", "d1", "d2" // Clobber List + : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } @@ -1505,6 +1645,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 q4, q1, q3 \n" // average rows of UV + "vst1.8 {q4}, [%2]! \n" // store 8 UV. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1590,7 +1753,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 @@ -1664,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 - // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + + "vld1.8 {d0}, [%4] \n" // load rgbuvconstants + "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient + "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient + "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient + "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient + "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1694,15 +1865,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
"bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + // clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ @@ -1762,7 +1971,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1808,6 +2017,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + // TODO(fbarchard): Subsample match C code. 
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, @@ -2494,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; // RGB to JPeg coefficients @@ -2502,11 +2755,9 @@ struct RgbConstants { // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2515,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // Add 16.5 = 0x1080 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; + 0x1080}; -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, @@ -2567,6 +2815,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -2846,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "vmov.u16 q15, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. 
"1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. @@ -2853,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q11, d1, d3 \n" // g * a "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8 + "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8 + "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); + : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15"); } // Quantize 8 ARGB pixels (32 bytes). @@ -3633,7 +3887,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3687,31 +3941,25 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "vdup.16 q0, %3 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" + "vdup.16 d8, %3 \n" + "1: \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" + "vst1.16 {q0, q1}, [%1]! 
\n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); } // Use scale to convert lsb formats to msb, depending how many bits there are: diff --git a/files/source/row_neon64.cc b/source/row_neon64.cc index 0f120373..1679f87c 100644 --- a/files/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -142,6 +142,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v19"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -627,6 +650,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead + "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -651,6 +694,100 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, } #if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "prfm pldl1keep, [%0, 1792] \n" + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "prfm pldl1keep, [%1, 1792] \n" + "subs %w3, %w3, #16 \n" // store 8 YUY2 + "st2 {v0.16b,v1.16b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 1792] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 
YUY2 + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list + ); +} +#endif + +// Unpack MT2T into tiled P010 64 pixels at a time. See +// tinyurl.com/mtk-10bit-video-format for format documentation. +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "ld1 {v7.16b}, [%0], #16 \n" + "ld1 {v0.16b-v3.16b}, [%0], #64 \n" + "shl v4.16b, v7.16b, #6 \n" + "shl v5.16b, v7.16b, #4 \n" + "shl v6.16b, v7.16b, #2 \n" + "subs %2, %2, #80 \n" + "zip1 v16.16b, v4.16b, v0.16b \n" + "zip1 v18.16b, v5.16b, v1.16b \n" + "zip1 v20.16b, v6.16b, v2.16b \n" + "zip1 v22.16b, v7.16b, v3.16b \n" + "zip2 v17.16b, v4.16b, v0.16b \n" + "zip2 v19.16b, v5.16b, v1.16b \n" + "zip2 v21.16b, v6.16b, v2.16b \n" + "zip2 v23.16b, v7.16b, v3.16b \n" + "sri v16.8h, v16.8h, #10 \n" + "sri v17.8h, v17.8h, #10 \n" + "sri v18.8h, v18.8h, #10 \n" + "sri v19.8h, v19.8h, #10 \n" + "st1 {v16.8h-v19.8h}, [%1], #64 \n" + "sri v20.8h, v20.8h, #10 \n" + "sri v21.8h, v21.8h, #10 \n" + "sri v22.8h, v22.8h, #10 \n" + "sri v23.8h, v23.8h, #10 \n" + "st1 {v20.8h-v23.8h}, [%1], #64 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + +#if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -1729,6 +1866,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row + "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV + "prfm pldl1keep, [%0, 448] \n" + "st1 {v4.16b}, [%2], #16 \n" // store 8 UV. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1819,24 +1979,23 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 + "dup v1.4s, %w3 \n" // dither4 "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. "uqadd v16.8b, v16.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "uqadd v17.8b, v17.8b, v1.8b \n" "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_rgb), // %1 + "+r"(width) // %2 + : "r"(dither4) // %3 : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } @@ -2039,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. 
-void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 - // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 + "ldr d0, [%4] \n" // load rgbuvconstants + "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient + "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient + "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient + "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient + "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient + "movi v29.16b, #0x80 \n" // 128.5 + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -2070,15 +2236,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", "v27", "v28", "v29"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + #define RGBTOUV_SETUP_REG \ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ @@ -2144,6 +2348,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2189,6 +2394,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2738,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - // ARGB expects first 3 values to contain RGB and 4th value is ignored. 
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, @@ -2800,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "v17"); } +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; + void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); } @@ -2812,6 +3056,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -3193,6 +3441,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "movi v7.8h, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. 
"1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -3201,16 +3451,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "prfm pldl1keep, [%0, 448] \n" "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8 + "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8 + "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Quantize 8 ARGB pixels (32 bytes). @@ -3751,6 +4001,86 @@ void ByteToFloatRow_NEON(const uint8_t* src, : "cc", "memory", "v1", "v2", "v3"); } +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v1.4h \n" // 8 floats + "fcvtl2 v3.4s, v1.8h \n" + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +// Convert FP16 Half Floats to FP32 Floats +// Read a column and write a row +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width) { + asm volatile( + "cmp %w2, #8 \n" // Is there 8 rows? 
+ "b.lo 2f \n" + "1: \n" + "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats + "ld1 {v0.h}[1], [%0], %3 \n" + "ld1 {v0.h}[2], [%0], %3 \n" + "ld1 {v0.h}[3], [%0], %3 \n" + "ld1 {v1.h}[0], [%0], %3 \n" + "ld1 {v1.h}[1], [%0], %3 \n" + "ld1 {v1.h}[2], [%0], %3 \n" + "ld1 {v1.h}[3], [%0], %3 \n" + "subs %w2, %w2, #8 \n" // 8 rows per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v0.4h \n" // 4 floats + "fcvtl v3.4s, v1.4h \n" // 4 more floats + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + "cmp %w2, #1 \n" // Is there 1 value? + "b.lo 3f \n" + "2: \n" + "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats + "subs %w2, %w2, #1 \n" // 1 floats per loop + "fcvtl v2.4s, v1.4h \n" // 1 floats + "str s2, [%1], #4 \n" // store 1 floats + "b.gt 2b \n" + "3: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)(src_stride * 2)) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width) { + asm volatile( + "1: \n" + "ldp q2, q3, [%0], #32 \n" // load 8 floats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats + "fcvtn2 v1.8h, v3.4s \n" + "str q1, [%1], #16 \n" // store 8 fp16 halffloats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + float ScaleMaxSamples_NEON(const float* src, float* dst, float scale, @@ -4241,23 +4571,19 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "dup v0.8h, %w3 \n" + "dup v4.8h, %w3 \n" "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" + "ldp q2, q3, [%0], #32 \n" + "umull v0.4s, v2.4h, v4.4h \n" + "umull2 v1.4s, v2.8h, v4.8h \n" + "umull v2.4s, v3.4h, v4.4h \n" + "umull2 v3.4s, v3.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, 
v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1], #32 \n" // store 16 pixels + "shrn v0.4h, v0.4s, #16 \n" + "shrn2 v0.8h, v1.4s, #16 \n" + "shrn v1.4h, v2.4s, #16 \n" + "shrn2 v1.8h, v3.4s, #16 \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 diff --git a/source/row_rvv.cc b/source/row_rvv.cc new file mode 100644 index 00000000..0bf2bef6 --- /dev/null +++ b/source/row_rvv.cc @@ -0,0 +1,1394 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * Contributed by Darren Hsieh <darren.hsieh@sifive.com> + * Contributed by Bruce Lai <bruce.lai@sifive.com> + */ + +#include "libyuv/row.h" + +// This module is for clang rvv. GCC hasn't supported segment load & store. +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ + defined(__clang__) +#include <assert.h> +#include <riscv_vector.h> + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Fill YUV -> RGB conversion constants into vectors +// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode +// register) is set to round-to-nearest-up mode(0). 
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ + } + +// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 +#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444 +#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Convert from YUV to fixed point RGB +#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ + v_b_16, v_r_16) \ + { \ + vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m8_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ + v_tmp1 = 
__riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ + v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ + v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ + } + +// Convert from fixed point RGB To 8 bit RGB +#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ + { \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ + } + +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = 
__riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +#ifdef HAS_ARGBTOAR64ROW_RVV +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e8m4(avl); + v_argb = __riscv_vle8_v_u8m4(src_argb, vl); + v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl); + v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl); + __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl); + avl -= vl; + src_argb += vl; + dst_ar64 += vl; + } while (avl > 0); +} +#endif + +#ifdef HAS_ARGBTOAB64ROW_RVV +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} +#endif + +#ifdef HAS_AR64TOARGBROW_RVV +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e16m8(avl); 
+ v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); + v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); + __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); + avl -= vl; + src_ar64 += vl; + dst_argb += vl; + } while (avl > 0); +} +#endif + +#ifdef HAS_AR64TOAB64ROW_RVV +void AR64ToAB64Row_RVV(const uint16_t* src_ar64, + uint16_t* dst_ab64, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e16m2(w); + vuint16m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_ar64 += vl * 4; + dst_ab64 += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_AB64TOARGBROW_RVV +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} +#endif + +#ifdef HAS_RAWTOARGBROW_RVV +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif + +#ifdef HAS_RAWTORGBAROW_RVV +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = 
__riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif + +#ifdef HAS_RAWTORGB24ROW_RVV +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTORAWROW_RVV +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTORGB24ROW_RVV +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTOABGRROW_RVV +void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_argb += vl * 4; + 
dst_abgr += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTOBGRAROW_RVV +void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_bgra += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTORGBAROW_RVV +void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_RGBATOARGBROW_RVV +void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgba += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_RGB24TOARGBROW_RVV +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif + +#ifdef HAS_I444TOARGBROW_RVV +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + 
const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_I444ALPHATOARGBROW_RVV +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_I444TORGB24ROW_RVV +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, 
br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_I422TOARGBROW_RVV +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_I422ALPHATOARGBROW_RVV +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, 
src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_I422TORGBAROW_RVV +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_I422TORGB24ROW_RVV +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, 
v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_I400TOARGBROW_RVV +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yb; + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) sets to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_J400TOARGBROW_RVV +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + 
src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif + +#ifdef HAS_COPYROW_RVV +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); + __riscv_vse8_v_u8m8(dst, v_data, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_NV12TOARGBROW_RVV +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV12TORGB24ROW_RVV +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, 
v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TOARGBROW_RVV +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TORGB24ROW_RVV +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 + +#ifdef HAS_INTERPOLATEROW_RVV +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int 
source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + // Blend 100 / 0 - Copy row unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + // Use round-to-nearest-up mode for averaging add + vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. 
+ do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + // Use round-to-nearest-up mode for vnclip + __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} +#endif + +#ifdef HAS_SPLITRGBROW_RVV +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_MERGERGBROW_RVV +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_SPLITARGBROW_RVV +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + 
__riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_MERGEARGBROW_RVV +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SPLITXRGBROW_RVV +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_MERGEXRGBROW_RVV +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while 
(w > 0); +} +#endif + +#ifdef HAS_SPLITUVROW_RVV +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_MERGEUVROW_RVV +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} +#endif + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored +#ifdef HAS_ARGBTOYMATRIXROW_RVV +void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY 
constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBTOYROW_RVV +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); +} +#endif + +#ifdef HAS_ARGBTOYJROW_RVV +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} +#endif + +#ifdef HAS_ABGRTOYROW_RVV +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); +} +#endif + +#ifdef HAS_ABGRTOYJROW_RVV +void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); +} +#endif + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+#ifdef HAS_RGBATOYMATRIXROW_RVV +void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_RGBATOYROW_RVV +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); +} +#endif + +#ifdef HAS_RGBATOYJROW_RVV +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} +#endif + +#ifdef HAS_BGRATOYROW_RVV +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); +} +#endif + +#ifdef HAS_RGBTOYMATRIXROW_RVV +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to 
store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_RGB24TOYJROW_RVV +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +#endif + +#ifdef HAS_RAWTOYJROW_RVV +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} +#endif + +#ifdef HAS_RGB24TOYROW_RVV +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +#endif + +#ifdef HAS_RAWTOYROW_RVV +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} +#endif + +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. +// src_argb: RGB values have already been pre-multiplied by the a. 
+#ifdef HAS_ARGBBLENDROW_RVV +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_BLENDPLANEROW_RVV +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + size_t w = (size_t)width; + do { + vuint16m8_t v_dst_u16; + vuint8m4_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl); + vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl); + + // (a 
* foreground) + (1-a) * background + v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl); + v_dst_u16 = + __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl); + v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl); + v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl); + + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src0 += vl; + src1 += vl; + alpha += vl; + dst += vl; + } while (w > 0); +} +#endif + +// Attenuate: (f * a + 255) >> 8 +#ifdef HAS_ARGBATTENUATEROW_RVV +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + // f * a + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBEXTRACTALPHAROW_RVV +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV +void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = 
(size_t)width; + const ptrdiff_t dst_stride = 4; + dst += 3; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl); + __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl); + w -= vl; + src += vl; + dst += vl * dst_stride; + } while (w > 0); +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && + // defined(__clang__) diff --git a/files/source/row_win.cc b/source/row_win.cc index c7c1ff60..5fb28521 100644 --- a/files/source/row_win.cc +++ b/source/row_win.cc @@ -14,7 +14,9 @@ #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) -#if defined(_M_X64) +#if defined(_M_ARM64EC) +#include <intrin.h> +#elif defined(_M_X64) #include <emmintrin.h> #include <tmmintrin.h> // For _mm_maddubs_epi16 #endif @@ -893,7 +895,7 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { @@ -940,7 +942,7 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -2789,6 +2791,44 @@ __declspec(naked) void I422ToRGB24Row_SSSE3( } } +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). 
+__declspec(naked) void I444ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV444 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). __declspec(naked) void I422ToRGB565Row_SSSE3( @@ -3423,17 +3463,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 + vpmovzxbw ymm0, [eax] + vpmovzxbw ymm1, [eax + edx] + lea eax, [eax + 16] + vpsllw ymm1, ymm1, 8 + vpor ymm2, ymm1, ymm0 + vmovdqu [edi], ymm2 + lea edi, [edi + 32] + sub ecx, 16 jg convertloop pop edi diff --git a/files/source/scale.cc b/source/scale.cc index e1335f1e..b7a602ba 100644 --- a/files/source/scale.cc +++ b/source/scale.cc @@ -135,6 +135,14 @@ static void ScalePlaneDown2(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown2 = filtering == kFilterNone + ? 
ScaleRowDown2_RVV + : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV + : ScaleRowDown2Box_RVV); + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -198,6 +206,51 @@ static void ScalePlaneDown2_16(int src_width, } } +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width, int scale) = + (src_width & 1) + ? (filtering == kFilterNone + ? ScaleRowDown2_16To8_Odd_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C + : ScaleRowDown2Box_16To8_Odd_C)) + : (filtering == kFilterNone + ? ScaleRowDown2_16To8_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C + : ScaleRowDown2Box_16To8_C)); + int row_stride = src_stride * 2; + (void)dst_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < src_height / 2; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale); + src_ptr += row_stride; + dst_ptr += dst_stride; + } + if (src_height & 1) { + if (!filtering) { + src_ptr -= src_stride; // Point to last row. + } + ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale); + } +} + // Scale plane, 1/4 // This is an optimized version for scaling down a plane to 1/4 of // its original size. @@ -267,6 +320,11 @@ static void ScalePlaneDown4(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN4_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_RVV : ScaleRowDown4_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -427,6 +485,17 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_RVV; + ScaleRowDown34_1 = ScaleRowDown34_RVV; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV; + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -642,6 +711,17 @@ static void ScalePlaneDown38(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN38_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_RVV; + ScaleRowDown38_2 = ScaleRowDown38_RVV; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV; + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -775,9 +855,11 @@ static void ScaleAddCols2_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); } } @@ -797,9 +879,10 @@ static void ScaleAddCols2_16_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16; } } @@ -814,7 +897,7 @@ static void ScaleAddCols0_C(int dst_width, (void)dx; src_ptr += (x >> 16); for (i 
= 0; i < dst_width; ++i) { - *dst_ptr++ = src_ptr[i] * scaleval >> 16; + *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16); } } @@ -829,7 +912,7 @@ static void ScaleAddCols1_C(int dst_width, int i; x >>= 16; for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); x += boxwidth; } } @@ -856,14 +939,14 @@ static void ScaleAddCols1_16_C(int dst_width, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. -static void ScalePlaneBox(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static int ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -877,6 +960,8 @@ static void ScalePlaneBox(int src_width, { // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); + if (!row16) + return 1; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint16_t* src_ptr, uint8_t* dst_ptr) = (dx & 0xffff) ? 
ScaleAddCols2_C @@ -923,6 +1008,11 @@ static void ScalePlaneBox(int src_width, } } #endif +#if defined(HAS_SCALEADDROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleAddRow = ScaleAddRow_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; @@ -943,16 +1033,17 @@ static void ScalePlaneBox(int src_width, } free_aligned_buffer_64(row16); } + return 0; } -static void ScalePlaneBox_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static int ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -966,6 +1057,8 @@ static void ScalePlaneBox_16(int src_width, { // Allocate a row buffer of uint32_t. align_buffer_64(row32, src_width * 4); + if (!row32) + return 1; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint32_t* src_ptr, uint16_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; @@ -997,18 +1090,19 @@ static void ScalePlaneBox_16(int src_width, } free_aligned_buffer_64(row32); } + return 0; } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { +static int ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -1017,13 +1111,15 @@ void ScalePlaneBilinearDown(int src_width, // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. 
// Allocate a row buffer. align_buffer_64(row, src_width); + if (!row) + return 1; const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1070,6 +1166,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1121,17 +1222,18 @@ void ScalePlaneBilinearDown(int src_width, } } free_aligned_buffer_64(row); + return 0; } -void ScalePlaneBilinearDown_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { +static int ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -1140,13 +1242,15 @@ void ScalePlaneBilinearDown_16(int src_width, // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row buffer. 
align_buffer_64(row, src_width * 2); + if (!row) + return 1; const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1212,18 +1316,19 @@ void ScalePlaneBilinearDown_16(int src_width, } } free_aligned_buffer_64(row); + return 0; } // Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { +static int ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1231,10 +1336,10 @@ void ScalePlaneBilinearUp(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1265,6 +1370,11 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1315,11 +1425,13 @@ void ScalePlaneBilinearUp(int src_width, const uint8_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleFilterCols(rowptr, src, dst_width, x, dx); @@ -1360,6 +1472,7 @@ void ScalePlaneBilinearUp(int src_width, } free_aligned_buffer_64(row); } + return 0; } // Scale plane, horizontally up by 2 times. @@ -1367,20 +1480,21 @@ void ScalePlaneBilinearUp(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of I422 to I444. -void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_Any_C; int i; int y; int dy; + (void)src_width; // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); @@ -1407,6 +1521,11 @@ void ScalePlaneUp2_Linear(int src_width, ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; } #endif +#ifdef HAS_SCALEROWUP2_LINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp = ScaleRowUp2_Linear_RVV; + } +#endif if (dst_height == 1) { ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, @@ -1426,19 +1545,20 @@ void ScalePlaneUp2_Linear(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of I420 to I444. -void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_Any_C; int x; + (void)src_width; // This function can only scale up by 2 times. assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); @@ -1466,6 +1586,11 @@ void ScalePlaneUp2_Bilinear(int src_width, Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; } #endif +#ifdef HAS_SCALEROWUP2_BILINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp = ScaleRowUp2_Bilinear_RVV; + } +#endif Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); dst_ptr += dst_stride; @@ -1486,20 +1611,21 @@ void ScalePlaneUp2_Bilinear(int src_width, // its original width, using linear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I210 to I410 and I212 to I412. 
-void ScalePlaneUp2_12_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_16_Any_C; int i; int y; int dy; + (void)src_width; // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); @@ -1540,19 +1666,20 @@ void ScalePlaneUp2_12_Linear(int src_width, // its original size, using bilinear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I010 to I410 and I012 to I412. -void ScalePlaneUp2_12_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; int x; + (void)src_width; // This function can only scale up by 2 times. 
assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); @@ -1587,20 +1714,21 @@ void ScalePlaneUp2_12_Bilinear(int src_width, } } -void ScalePlaneUp2_16_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_16_Any_C; int i; int y; int dy; + (void)src_width; // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); @@ -1636,19 +1764,20 @@ void ScalePlaneUp2_16_Linear(int src_width, } } -void ScalePlaneUp2_16_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; int x; + (void)src_width; // This function can only scale up by 2 times. 
assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); @@ -1683,15 +1812,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width, } } -void ScalePlaneBilinearUp_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { +static int ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1699,10 +1828,10 @@ void ScalePlaneBilinearUp_16(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1766,12 +1895,13 @@ void ScalePlaneBilinearUp_16(int src_width, const uint16_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. 
- const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); - - uint16_t* rowptr = (uint16_t*)row; - int rowstride = kRowSize; + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 4); + int rowstride = row_size; int lasty = yi; + uint16_t* rowptr = (uint16_t*)row; + if (!row) + return 1; ScaleFilterCols(rowptr, src, dst_width, x, dx); if (src_height > 1) { @@ -1811,6 +1941,7 @@ void ScalePlaneBilinearUp_16(int src_width, } free_aligned_buffer_64(row); } + return 0; } // Scale Plane to/from any dimensions, without interpolation. @@ -1827,7 +1958,7 @@ static void ScalePlaneSimple(int src_width, const uint8_t* src_ptr, uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1864,7 +1995,7 @@ static void ScalePlaneSimple_16(int src_width, const uint16_t* src_ptr, uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1895,15 +2026,15 @@ static void ScalePlaneSimple_16(int src_width, // Scale a plane. // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { +int ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. 
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); @@ -1919,7 +2050,7 @@ void ScalePlane(const uint8_t* src, if (dst_width == src_width && dst_height == src_height) { // Straight copy. CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; + return 0; } if (dst_width == src_width && filtering != kFilterBox) { int dy = 0; @@ -1935,7 +2066,7 @@ void ScalePlane(const uint8_t* src, // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); - return; + return 0; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. @@ -1943,69 +2074,67 @@ void ScalePlane(const uint8_t* src, // optimized, 3/4 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } // 3/8 rounded up for odd sized chroma height. 
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } if (4 * dst_width == src_width && 4 * dst_height == src_height && (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; + return ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); } if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; + return ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); } if (filtering) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; + return ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); } ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); + return 0; } LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - 
int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { +int ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); @@ -2021,7 +2150,7 @@ void ScalePlane_16(const uint16_t* src, if (dst_width == src_width && dst_height == src_height) { // Straight copy. CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; + return 0; } if (dst_width == src_width && filtering != kFilterBox) { int dy = 0; @@ -2040,7 +2169,7 @@ void ScalePlane_16(const uint16_t* src, // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); - return; + return 0; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. @@ -2048,69 +2177,68 @@ void ScalePlane_16(const uint16_t* src, // optimized, 3/4 ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } // 3/8 rounded up for odd sized chroma height. 
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } if (4 * dst_width == src_width && 4 * dst_height == src_height && (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - return; + return 0; } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; + return ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); } if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; + return ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); } if (filtering) { - ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; + return ScalePlaneBilinearDown_16(src_width, src_height, dst_width, + dst_height, src_stride, dst_stride, src, + dst, filtering); } ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); + return 0; } LIBYUV_API -void ScalePlane_12(const 
uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { +int ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); @@ -2125,17 +2253,17 @@ void ScalePlane_12(const uint16_t* src, if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); - return; + return 0; } - ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, - dst_width, dst_height, filtering); + return ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, + dst_width, dst_height, filtering); } // Scale an I420 image. 
@@ -2163,6 +2291,7 @@ int I420Scale(const uint8_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2170,13 +2299,19 @@ int I420Scale(const uint8_t* src_y, return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; + r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return r; } LIBYUV_API @@ -2201,6 +2336,7 @@ int I420Scale_16(const uint16_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2208,13 +2344,19 @@ int I420Scale_16(const uint16_t* src_y, return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - 
ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; + r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return r; } LIBYUV_API @@ -2239,6 +2381,7 @@ int I420Scale_12(const uint16_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2246,13 +2389,19 @@ int I420Scale_12(const uint16_t* src_y, return -1; } - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; + r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return r; } // Scale an I444 image. 
@@ -2276,19 +2425,27 @@ int I444Scale(const uint8_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { + int r; + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; + r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, + dst_stride_u, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, + dst_stride_v, dst_width, dst_height, filtering); + return r; } LIBYUV_API @@ -2309,19 +2466,27 @@ int I444Scale_16(const uint16_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { + int r; + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; + r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = 
ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, + dst_stride_u, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, + dst_stride_v, dst_width, dst_height, filtering); + return r; } LIBYUV_API @@ -2342,19 +2507,27 @@ int I444Scale_12(const uint16_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { + int r; + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; + r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, + dst_stride_u, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, + dst_stride_v, dst_width, dst_height, filtering); + return r; } // Scale an I422 image. 
@@ -2380,6 +2553,7 @@ int I422Scale(const uint8_t* src_y, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2387,13 +2561,19 @@ int I422Scale(const uint8_t* src_y, return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u, - dst_stride_u, dst_halfwidth, dst_height, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v, - dst_stride_v, dst_halfwidth, dst_height, filtering); - return 0; + r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return r; } LIBYUV_API @@ -2416,6 +2596,7 @@ int I422Scale_16(const uint16_t* src_y, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2423,13 +2604,19 @@ int I422Scale_16(const uint16_t* src_y, return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u, - dst_stride_u, dst_halfwidth, dst_height, filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v, - dst_stride_v, 
dst_halfwidth, dst_height, filtering); - return 0; + r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return r; } LIBYUV_API @@ -2452,6 +2639,7 @@ int I422Scale_12(const uint16_t* src_y, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int r; if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || @@ -2459,13 +2647,19 @@ int I422Scale_12(const uint16_t* src_y, return -1; } - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u, - dst_stride_u, dst_halfwidth, dst_height, filtering); - ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v, - dst_stride_v, dst_halfwidth, dst_height, filtering); - return 0; + r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + if (r != 0) { + return r; + } + r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return r; } // Scale an NV12 image. 
@@ -2489,6 +2683,7 @@ int NV12Scale(const uint8_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int r; if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || @@ -2496,11 +2691,14 @@ int NV12Scale(const uint8_t* src_y, return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, - dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); - return 0; + r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, + dst_stride_y, dst_width, dst_height, filtering); + if (r != 0) { + return r; + } + r = UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, + dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); + return r; } // Deprecated api diff --git a/files/source/scale_any.cc b/source/scale_any.cc index 317041f8..f6576874 100644 --- a/files/source/scale_any.cc +++ b/source/scale_any.cc @@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2_NEON +SDANY(ScaleUVRowDown2_Any_NEON, + ScaleUVRowDown2_NEON, + ScaleUVRowDown2_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON +SDANY(ScaleUVRowDown2Linear_Any_NEON, + ScaleUVRowDown2Linear_NEON, + ScaleUVRowDown2Linear_C, + 2, + 2, + 7) +#endif #ifdef HAS_SCALEUVROWDOWN2BOX_NEON SDANY(ScaleUVRowDown2Box_Any_NEON, ScaleUVRowDown2Box_NEON, diff --git a/files/source/scale_argb.cc b/source/scale_argb.cc index 9c3acf7f..18bdeb86 100644 --- a/files/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -16,6 +16,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" +#include "libyuv/scale_argb.h" #include "libyuv/scale_row.h" #ifdef __cplusplus @@ -58,9 +59,9 @@ static void 
ScaleARGBDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; } else { - src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4; } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) @@ -127,6 +128,15 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_RVV + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_RVV + : ScaleARGBRowDown2Box_RVV); + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -141,28 +151,33 @@ static void ScaleARGBDown2(int src_width, // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. -static void ScaleARGBDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy) { +static int ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 4 + 31) & ~31; + // TODO(fbarchard): Remove this row buffer and implement a ScaleARGBRowDown4 + // but implemented via a 2 pass wrapper that uses a very small array on the + // stack with a horizontal loop. 
+ align_buffer_64(row, row_size * 2); + if (!row) + return 1; int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; (void)src_width; (void)src_height; (void)dx; @@ -184,16 +199,22 @@ static void ScaleARGBDown4Box(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + ScaleARGBRowDown2(row, row_size, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } free_aligned_buffer_64(row); + return 0; } // ScaleARGB ARGB Even @@ -214,7 +235,7 @@ static void ScaleARGBDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; @@ -222,7 +243,7 @@ static void ScaleARGBDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_SSE2 @@ -263,6 +284,16 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVENBOX_RVV) + if (filtering && TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEvenBox_RVV; + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV) + if (!filtering && TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -275,24 +306,24 @@ static void ScaleARGBDownEven(int src_width, } // Scale ARGB down with bilinear interpolation. -static void ScaleARGBBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { +static int ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -348,6 +379,11 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -381,6 +417,8 @@ static void ScaleARGBBilinearDown(int src_width, // Allocate a row of ARGB. { align_buffer_64(row, clip_src_width * 4); + if (!row) + return 1; const int max_y = (src_height - 1) << 16; if (y > max_y) { @@ -388,7 +426,7 @@ static void ScaleARGBBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -404,27 +442,28 @@ static void ScaleARGBBilinearDown(int src_width, } free_aligned_buffer_64(row); } + return 0; } // Scale ARGB up with bilinear interpolation. 
-static void ScaleARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { +static int ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; @@ -468,6 +507,11 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; @@ -545,14 +589,16 @@ static void ScaleARGBBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; // Allocate 2 rows of ARGB. 
- const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -570,7 +616,7 @@ static void ScaleARGBBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_argb + yi * (int64_t)src_stride; + src = src_argb + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -593,27 +639,28 @@ static void ScaleARGBBilinearUp(int src_width, } free_aligned_buffer_64(row); } + return 0; } #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. -static void ScaleYUVToARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int dst_stride_argb, - const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { +static int ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { int j; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, int width) = @@ -659,6 +706,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow 
= I422ToARGBRow_Any_LASX; @@ -667,8 +722,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -711,8 +771,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { @@ -793,20 +858,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - // Allocate 1 row of ARGB for source conversion. - align_buffer_64(argb_row, src_width * 4); + // Allocate 1 row of ARGB for source conversion and 2 rows of ARGB + // scaled horizontally to the destination width. 
+ const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2 + src_width * 4); + uint8_t* argb_row = row + row_size * 2; uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; + if (!row) + return 1; // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); @@ -833,9 +899,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, y = max_y; yi = y >> 16; uv_yi = yi >> kYShift; - src_row_y = src_y + yi * (int64_t)src_stride_y; - src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + src_row_y = src_y + yi * (intptr_t)src_stride_y; + src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; } if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. @@ -861,7 +927,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, y += dy; } free_aligned_buffer_64(row); - free_aligned_buffer_64(row_argb); + return 0; } #endif @@ -883,7 +949,7 @@ static void ScaleARGBSimple(int src_width, int y, int dy) { int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; @@ -926,7 +992,7 @@ static void ScaleARGBSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_argb += dst_stride; y += dy; @@ -936,19 +1002,19 @@ static void ScaleARGBSimple(int src_width, // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. 
-static void ScaleARGB(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { +static int ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -962,7 +1028,7 @@ static void ScaleARGB(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -977,7 +1043,7 @@ static void ScaleARGB(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -993,27 +1059,27 @@ static void ScaleARGB(const uint8_t* src, ScaleARGBDown2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); - return; + return 0; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. - ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; + return ScaleARGBDown4Box(src_width, src_height, clip_width, + clip_height, src_stride, dst_stride, src, + dst, x, dx, y, dy); } ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); - return; + return 0; } // Optimized odd scale down. ie 3, 5, 7, 9x. 
if ((dx & 0x10000) && (dy & 0x10000)) { filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. - ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, + ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); - return; + return 0; } } } @@ -1022,22 +1088,21 @@ static void ScaleARGB(const uint8_t* src, // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering); - return; + return 0; } if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; + return ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); } if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; + return ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); } ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy); + return 0; } LIBYUV_API @@ -1061,10 +1126,9 @@ int ARGBScaleClip(const uint8_t* src_argb, (clip_y + clip_height) > dst_height) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, - clip_height, filtering); - return 0; + return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); } // Scale an ARGB image. 
@@ -1082,10 +1146,9 @@ int ARGBScale(const uint8_t* src_argb, src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, - filtering); - return 0; + return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, + dst_height, filtering); } // Scale with YUV conversion to ARGB and clipping. @@ -1109,8 +1172,11 @@ int YUVToARGBScaleClip(const uint8_t* src_y, int clip_width, int clip_height, enum FilterMode filtering) { - uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); + if (!argb_buffer) { + return 1; // Out of memory runtime error. + } (void)src_fourcc; // TODO(fbarchard): implement and/or assert. (void)dst_fourcc; I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, diff --git a/files/source/scale_common.cc b/source/scale_common.cc index b02bdafd..d07a39af 100644 --- a/files/source/scale_common.cc +++ b/source/scale_common.cc @@ -23,6 +23,25 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast<type>(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + +// TODO(fbarchard): make clamp255 preserve negative values. +static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + static __inline int Abs(int v) { return v >= 0 ? 
v : -v; } @@ -62,6 +81,50 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + } +} + +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst += 1; + src_ptr += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale)); +} + void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -98,6 +161,52 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + } +} + +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* 
dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst += 1; + s += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale)); +} + void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -160,6 +269,61 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + } +} + +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, 
scale)); + dst += 1; + s += 2; + t += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale)); +} + void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -1116,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); int x; (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; } } @@ -1469,7 +1628,7 @@ void ScalePlaneVertical(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1519,6 +1678,12 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1548,7 +1713,7 @@ void ScalePlaneVertical_16(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; @@ -1627,7 +1792,7 @@ void ScalePlaneVertical_16To8(int src_height, // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. - void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int scale, int dst_width, int source_y_fraction) = InterpolateRow_16To8_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1799,35 +1964,6 @@ void ScaleSlope(int src_width, } #undef CENTERSTART -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2 = src_ptr + src_stride; - - int x; - for (x = 0; x < dst_width - 1; x += 2) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; - ++src_ptr; - ++src2; - dst += 2; - } - if (dst_width & 1) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - } -} - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_gcc.cc b/source/scale_gcc.cc index edaf2e29..17eeffad 100644 --- a/files/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1094,7 +1094,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif @@ -1294,7 +1295,7 @@ void 
ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif diff --git a/files/source/scale_lsx.cc b/source/scale_lsx.cc index bfe5e9fb..bfe5e9fb 100644 --- a/files/source/scale_lsx.cc +++ b/source/scale_lsx.cc diff --git a/files/source/scale_msa.cc b/source/scale_msa.cc index 482a521f..482a521f 100644 --- a/files/source/scale_msa.cc +++ b/source/scale_msa.cc diff --git a/files/source/scale_neon.cc b/source/scale_neon.cc index 6a0d6e1b..ccc75106 100644 --- a/files/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.16 {q1}, [%1]! \n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.16 {q0}, [%1]! 
\n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_neon64.cc b/source/scale_neon64.cc index 9f9636e6..7c072380 100644 --- a/files/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1118,101 +1118,6 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, #undef LOAD2_DATA8_LANE -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. 
- "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction), // %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); -} - void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -1568,6 +1473,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_rgb.cc b/source/scale_rgb.cc index 8db59b56..8db59b56 100644 --- a/files/source/scale_rgb.cc +++ b/source/scale_rgb.cc diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc new file mode 100644 index 00000000..de037e45 --- /dev/null +++ b/source/scale_rvv.cc @@ -0,0 +1,1040 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * Contributed by Darren Hsieh <darren.hsieh@sifive.com> + * Contributed by Bruce Lai <bruce.lai@sifive.com> + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +// This module is for clang rvv. GCC hasn't supported segment load & store. 
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ + defined(__clang__) +#include <assert.h> +#include <riscv_vector.h> +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAS_SCALEADDROW_RVV +void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + size_t w = (size_t)src_width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl); + // Use widening multiply-add instead of widening + add + v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl); + __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_RVV +void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + size_t w = (size_t)dst_width; + const uint64_t* src = (const uint64_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + do { + size_t vl = __riscv_vsetvl_e64m8(w); + vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl); + vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl); + __riscv_vse32_v_u32m4(dst, v_dst, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV +void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)(src_argb); + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_odd, v_even, v_dst; + vuint32m4_t v_odd_32, v_even_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl); + v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32); + v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32); + // Use round-to-nearest-up mode for averaging add + v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV +void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; + vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; + vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl); + __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl); + v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32); + v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32); + v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32); + v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32); + v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4); + v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4); + v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * 2; + src1 += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV +void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + const int stride_byte = src_stepx * 4; + do { + size_t vl = __riscv_vsetvl_e32m8(w); + vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl); + __riscv_vse32_v_u32m8(dst, v_row, vl); + w -= vl; + src += vl * src_stepx; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV +void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const 
uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + const int stride_byte = src_stepx * 4; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; + vuint16m8_t v_row0_sum, v_row1_sum, v_sum; + vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0, + stride_byte, vl); + __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1, + stride_byte, vl); + v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32); + v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32); + v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32); + v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32); + v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4); + v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4); + v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * src_stepx; + src1 += vl * src_stepx; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2_RVV +void ScaleRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + size_t w = (size_t)dst_width; + const uint16_t* src = (const uint16_t*)src_ptr; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e16m8(w); + vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl); + vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2LINEAR_RVV +void ScaleRowDown2Linear_RVV(const 
uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_s0, v_s1, v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl); + // Use round-to-nearest-up mode for averaging add + v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src_ptr += 2 * vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2BOX_RVV +void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_s0, v_s1, v_t0, v_t1; + vuint16m8_t v_s01, v_t01, v_st01; + vuint8m4_t v_dst; + __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl); + __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl); + v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl); + v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl); + v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + s += 2 * vl; + t += 2 * vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN4_RVV +void ScaleRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl); + w -= vl; + src_ptr += (4 * vl); + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN4BOX_RVV +void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + vuint8m2_t v_u0, v_u1, v_u2, v_u3; + vuint8m2_t v_v0, v_v1, v_v2, v_v3; + vuint16m4_t v_s01, v_s23, v_t01, v_t23; + vuint16m4_t v_u01, v_u23, v_v01, v_v23; + vuint16m4_t v_st01, v_st23, v_uv01, v_uv23; + vuint16m4_t v_st0123, v_uv0123, v_stuv0123; + vuint8m2_t v_dst; + size_t vl = __riscv_vsetvl_e8m2(w); + + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl); + + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl); + v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl); + + __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl); + v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl); + v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl); + + v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl); + v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl); + v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl); + v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl); + + __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl); + + v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl); + v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl); + + v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl); + v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl); + + v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl); + v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl); + v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl); + __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 4 * vl; + src_ptr1 += 4 * vl; + src_ptr2 += 4 * vl; + src_ptr3 += 4 * vl; + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_RVV +void ScaleRowDown34_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + do { + size_t vl = 
__riscv_vsetvl_e8m2(w); + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl); + w -= vl; + src_ptr += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV +void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; + vuint8m2_t v_u0, v_u1, v_u2, v_u3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); + + if (src_stride == 0) { + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl); + } else { + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl); + t += 4 * vl; + } + + v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl); + v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl); + v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl); + v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl); + + // Use round-to-nearest-up mode for vnclip & averaging add + v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl); + v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl); + v_u2 = 
__riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl); + v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl); + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl); + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV +void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); + + // Use round-to-nearest-up mode for vnclip & averaging add + if (src_stride == 0) { + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl); + } else { + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl); + t += 4 * vl; + } + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl); + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_RVV +void ScaleRowDown38_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + (void)src_stride; + assert(dst_width % 3 == 0); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + size_t vl = __riscv_vsetvl_e8m1(w); + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); 
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV +void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 6u); + const uint16_t coeff_b = (65536u / 4u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; + vuint16m2_t v_e0, v_e1, v_e2, v_e; + vuint16m2_t v_f0, v_f1, v_f2, v_f; + vuint16m2_t v_g0, v_g1, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + // t: e01, e11, e21, f01, f11, f21, g01, g11 + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); + __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, + &v_t7, src_ptr + src_stride, vl); + // Calculate sum of [e00, e21] to v_e + // Calculate sum of [f00, f21] to v_f + // Calculate sum of [g00, g11] to v_g + v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); + v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = 
__riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + + __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV +void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 9u); + const uint16_t coeff_b = (65536u / 6u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; + vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7; + vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e; + vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f; + vuint16m2_t v_g0, v_g1, v_g2, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + // t: e01, e11, e21, f01, f11, f21, g01, g11 + // u: e02, e12, e22, f02, f12, f22, g02, g12 + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); + __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, + &v_t7, src_ptr + src_stride, vl); + __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6, + &v_u7, src_ptr + 2 * src_stride, vl); + // Calculate sum of [e00, e22] + v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl); + v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl); + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, 
v_e2, vl); + // Calculate sum of [f00, f22] + v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl); + v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl); + + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + // Calculate sum of [g00, g12] + v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); + v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl); + + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' +// ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other +// platforms only implement non-edge part of image and process edge with scalar. 
+ +#ifdef HAS_SCALEROWUP2_LINEAR_RVV +void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = (size_t)dst_width - 1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_src_ptr = src_ptr; + uint8_t* work_dst_ptr = dst_ptr + 1; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + while (src_width > 0) { + vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even; + vuint16m8_t v_src0_u16, v_src1_u16; + size_t vl = __riscv_vsetvl_e8m4(src_width); + v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl); + + v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl); + v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl); + v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl); + v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl); + + v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl); + v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl); + + __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl); + + src_width -= vl; + work_src_ptr += vl; + work_dst_ptr += 2 * vl; + } + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_RVV +void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint8_t* work_d = d + 1; + uint8_t* work_e = e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 
2; + while (src_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + size_t vl = __riscv_vsetvl_e8m2(src_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); + v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl); + __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl); + + src_width -= vl; + work_s += vl; + work_t += vl; + work_d += 2 * vl; + work_e += 2 * vl; + } + d[dst_width - 1] = + (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2; + e[dst_width - 1] = + (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2_RVV +void 
ScaleUVRowDown2_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)src_uv; + uint16_t* dst = (uint16_t*)dst_uv; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e32m8(w); + vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl); + vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl); + __riscv_vse16_v_u16m4(dst, v_u1v1, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV +void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const uint16_t* src = (const uint16_t*)src_uv; + (void)src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_u0v0, v_u1v1, v_avg; + vuint16m4_t v_u0v0_16, v_u1v1_16; + size_t vl = __riscv_vsetvl_e16m4(w); + __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl); + v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); + v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); + // Use round-to-nearest-up mode for averaging add + v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2); + __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2); + w -= vl; + src += vl * 2; + dst_uv += vl * 2; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_RVV +void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint8_t* src_uv_row1 = src_uv + src_stride; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; + vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; + vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1; + vuint16m4_t v_sum0, v_sum1; + vuint8m2_t v_dst_u, v_dst_v; + size_t vl = __riscv_vsetvl_e8m2(w); + + __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0, + src_uv, vl); + __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1, + src_uv_row1, vl); + + v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl); + v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl); + v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl); + v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl); + + v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl); + v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl); + // Use round-to-nearest-up mode for vnclip + v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl); + v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl); + + __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl); + + dst_uv += 2 * vl; + src_uv += 4 * vl; + w -= vl; + src_uv_row1 += 4 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN4_RVV +void ScaleUVRowDown4_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + // Overflow will never happen here, since sizeof(size_t)/sizeof(int)=2. + // dst_width = src_width / 4 and src_width is also int. 
+ size_t w = (size_t)dst_width * 8; + (void)src_stride; + (void)src_stepx; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl); + vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row); + // Narrowing without clipping + vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8); + vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8); + vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16); + __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4); + w -= vl; + src_uv += vl; + dst_uv += vl / 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWNEVEN_RVV +void ScaleUVRowDownEven_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2; + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e16m8(w); + vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl); + __riscv_vse16_v_u16m8(dst, v_row, vl); + w -= vl; + src += vl * src_stepx; + dst += vl; + } while (w > 0); +} +#endif + +// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' +// ScaleUVRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. +// Other platforms only implement non-edge part of image and process edge with +// scalar. 
+ +#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV +void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1; + const uint8_t* work_src_ptr = src_ptr; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + while (work_width > 0) { + vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8; + vuint16m4_t v_dst_odd, v_dst_even; + vuint16m8_t v_uv0_u16, v_uv1_u16; + size_t vl = __riscv_vsetvl_e8m4(work_width); + v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl); + + v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl); + v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl); + + v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl); + v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl); + + v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl); + v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl); + + v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8); + v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8); + + __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2); + + work_width -= vl; + work_src_ptr += vl; + work_dst_ptr += vl; + } + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV +void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint16_t* work_d = (uint16_t*)d + 
1; + uint16_t* work_e = (uint16_t*)e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 2; + d[1] = (3 * s[1] + t[1] + 2) >> 2; + e[1] = (s[1] + 3 * t[1] + 2) >> 2; + while (work_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8; + vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + size_t vl = __riscv_vsetvl_e8m2(work_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); + v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + v_dst0_even = 
__riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8); + v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8); + v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8); + v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8); + + __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2); + __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2); + + work_width -= vl; + work_s += vl; + work_t += vl; + work_d += vl; + work_e += vl; + } + d[2 * dst_width - 2] = + (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + e[2 * dst_width - 2] = + (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + d[2 * dst_width - 1] = + (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; + e[2 * dst_width - 1] = + (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && + // defined(__clang__) diff --git a/files/source/scale_uv.cc b/source/scale_uv.cc index 3b3d7b8e..0931c89a 100644 --- a/files/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -83,9 +83,9 @@ static void ScaleUVDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; } else { - src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; } #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) @@ -112,6 +112,31 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? 
ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_RVV + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_RVV + : ScaleUVRowDown2Box_RVV); + } +#endif // This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_SSSE3) @@ -130,23 +155,7 @@ static void ScaleUVDown2(int src_width, } } #endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif + #if defined(HAS_SCALEUVROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVRowDown2 = @@ -179,28 +188,30 @@ static void ScaleUVDown2(int src_width, // This is an optimized version for scaling down a UV to 1/4 of // its original size. #if HAS_SCALEUVDOWN4BOX -static void ScaleUVDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy) { +static int ScaleUVDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; int row_stride = src_stride * (dy >> 16); void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) = ScaleUVRowDown2Box_C; // Advance to odd row, even column. - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; (void)src_width; (void)src_height; (void)dx; @@ -231,16 +242,22 @@ static void ScaleUVDown4Box(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2BOX_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); - ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + ScaleUVRowDown2(row, row_size, dst_uv, dst_width); src_uv += row_stride; dst_uv += dst_stride; } free_aligned_buffer_64(row); + return 0; } #endif // HAS_SCALEUVDOWN4BOX @@ -263,7 +280,7 @@ static void ScaleUVDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, int src_step, uint8_t* dst_uv, int dst_width) = filtering ? 
ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; @@ -271,7 +288,7 @@ static void ScaleUVDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 @@ -310,6 +327,12 @@ static void ScaleUVDownEven(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) + if (TestCpuFlag(kCpuHasRVV) && !filtering) { + ScaleUVRowDownEven = + (col_step == 4) ? ScaleUVRowDown4_RVV : ScaleUVRowDownEven_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -324,24 +347,24 @@ static void ScaleUVDownEven(int src_width, // Scale UV down with bilinear interpolation. #if HAS_SCALEUVBILINEARDOWN -static void ScaleUVBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { +static int ScaleUVBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleUVFilterCols64_C : ScaleUVFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -397,6 +420,11 @@ static void ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; @@ -421,15 +449,16 @@ static void ScaleUVBilinearDown(int src_width, // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of UV. { - align_buffer_64(row, clip_src_width * 2); - const int max_y = (src_height - 1) << 16; + align_buffer_64(row, clip_src_width * 2); + if (!row) + return 1; if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); } else { @@ -445,29 +474,30 @@ static void ScaleUVBilinearDown(int src_width, } free_aligned_buffer_64(row); } + return 0; } #endif // Scale UV up with bilinear interpolation. 
#if HAS_SCALEUVBILINEARUP -static void ScaleUVBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { +static int ScaleUVBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; @@ -511,6 +541,11 @@ static void ScaleUVBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; } @@ -571,14 +606,16 @@ static void ScaleUVBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); + if (!row) + return 1; uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -596,7 +633,7 @@ static void ScaleUVBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_uv + yi * (int64_t)src_stride; + src = src_uv + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -619,6 +656,7 @@ static void ScaleUVBilinearUp(int src_width, } free_aligned_buffer_64(row); } + return 0; } #endif // HAS_SCALEUVBILINEARUP @@ -627,14 +665,14 @@ static void ScaleUVBilinearUp(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of NV16 to NV24. -void ScaleUVLinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv) { +static void ScaleUVLinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv) { void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_Any_C; int i; @@ -644,32 +682,38 @@ void ScaleUVLinearUp2(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; } #endif +#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp = ScaleUVRowUp2_Linear_RVV; + } +#endif + if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -680,14 +724,14 @@ void ScaleUVLinearUp2(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of NV12 to NV24. 
-void ScaleUVBilinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScaleUVBilinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_Any_C; @@ -697,24 +741,30 @@ void ScaleUVBilinearUp2(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; } #endif +#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_RVV; + } +#endif + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); dst_ptr += dst_stride; for (x = 0; x < src_height - 1; ++x) { @@ -734,14 +784,14 @@ void ScaleUVBilinearUp2(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of P210 to P410. 
-void ScaleUVLinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_uv, - uint16_t* dst_uv) { +static void ScaleUVLinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_uv, + uint16_t* dst_uv) { void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; int i; @@ -751,32 +801,32 @@ void ScaleUVLinearUp2_16(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -787,14 +837,14 @@ void ScaleUVLinearUp2_16(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of P010 to P410. 
-void ScaleUVBilinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScaleUVBilinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; @@ -804,19 +854,19 @@ void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; } @@ -854,7 +904,7 @@ static void ScaleUVSimple(int src_width, int y, int dy) { int j; - void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; @@ -889,7 +939,7 @@ static void ScaleUVSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_uv += dst_stride; y += dy; @@ -910,7 +960,7 @@ static int UVCopy(const uint8_t* src_uv, // Negative height means invert the image. 
if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -930,7 +980,7 @@ static int UVCopy_16(const uint16_t* src_uv, // Negative height means invert the image. if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -942,19 +992,19 @@ static int UVCopy_16(const uint16_t* src_uv, // Scale a UV plane (from NV12) // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleUV(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { +static int ScaleUV(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -968,7 +1018,7 @@ static void ScaleUV(const uint8_t* src, // Negative src_height means invert the image. 
if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -983,7 +1033,7 @@ static void ScaleUV(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -1000,22 +1050,22 @@ static void ScaleUV(const uint8_t* src, ScaleUVDown2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); - return; + return 0; } #endif #if HAS_SCALEUVDOWN4BOX if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. - ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; + return ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, + dy); } #endif #if HAS_SCALEUVDOWNEVEN ScaleUVDownEven(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); - return; + return 0; #endif } // Optimized odd scale down. ie 3, 5, 7, 9x. @@ -1024,9 +1074,9 @@ static void ScaleUV(const uint8_t* src, #ifdef HAS_UVCOPY if (dx == 0x10000 && dy == 0x10000) { // Straight copy. - UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, + UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2, src_stride, dst, dst_stride, clip_width, clip_height); - return; + return 0; } #endif } @@ -1037,38 +1087,37 @@ static void ScaleUV(const uint8_t* src, // Arbitrary scale vertically, but unscaled horizontally. 
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering); - return; + return 0; } - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); - return; + return 0; } if ((clip_height + 1) / 2 == src_height && (clip_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); - return; + return 0; } #if HAS_SCALEUVBILINEARUP if (filtering && dy < 65536) { - ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; + return ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); } #endif #if HAS_SCALEUVBILINEARDOWN if (filtering) { - ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; + return ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); } #endif ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy); + return 0; } // Scale an UV image. @@ -1086,9 +1135,9 @@ int UVScale(const uint8_t* src_uv, src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, - dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); - return 0; + return ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, + dst_stride_uv, dst_width, dst_height, 0, 0, dst_width, + dst_height, filtering); } // Scale a 16 bit UV image. 
@@ -1118,7 +1167,7 @@ int UVScale_16(const uint16_t* src_uv, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } src_width = Abs(src_width); @@ -1126,20 +1175,20 @@ int UVScale_16(const uint16_t* src_uv, #ifdef HAS_UVCOPY if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, + UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv, src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } else { dy = src_height / dst_height; - UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, - dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, - dst_height); + UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv, + (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv, + dst_width, dst_height); } return 0; } #endif - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, src_stride_uv, dst_stride_uv, src_uv, dst_uv); return 0; diff --git a/files/source/scale_win.cc b/source/scale_win.cc index ea1f95c6..ea1f95c6 100644 --- a/files/source/scale_win.cc +++ b/source/scale_win.cc diff --git a/files/source/test.sh b/source/test.sh index 7f12c3c1..7f12c3c1 100755 --- a/files/source/test.sh +++ b/source/test.sh diff --git a/files/source/video_common.cc b/source/video_common.cc index 92384c05..92384c05 100644 --- a/files/source/video_common.cc +++ b/source/video_common.cc diff --git a/tools_libyuv/OWNERS b/tools_libyuv/OWNERS new file mode 100644 index 00000000..aae4fb6e --- /dev/null +++ b/tools_libyuv/OWNERS @@ -0,0 +1,4 @@ +mbonadei@chromium.org +fbarchard@chromium.org 
+pbos@chromium.org + diff --git a/tools_libyuv/autoroller/roll_deps.py b/tools_libyuv/autoroller/roll_deps.py new file mode 100755 index 00000000..d5c1089f --- /dev/null +++ b/tools_libyuv/autoroller/roll_deps.py @@ -0,0 +1,822 @@ +#!/usr/bin/env vpython3 + +# Copyright (c) 2017 The LibYUV project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. +"""Script to automatically roll dependencies in the LibYUV DEPS file.""" + + +import argparse +import base64 +import collections +import logging +import os +import re +import subprocess +import sys +import urllib.request + + +def FindSrcDirPath(): + """Returns the abs path to the src/ dir of the project.""" + src_dir = os.path.dirname(os.path.abspath(__file__)) + while os.path.basename(src_dir) != 'src': + src_dir = os.path.normpath(os.path.join(src_dir, os.pardir)) + return src_dir + + +# Skip these dependencies (list without solution name prefix). +DONT_AUTOROLL_THESE = [ + 'third_party/fuchsia-gn-sdk', + 'src/third_party/gflags/src', + 'src/third_party/mockito/src', +] + +# These dependencies are missing in chromium/src/DEPS, either unused or already +# in-tree. For instance, src/base is a part of the Chromium source git repo, +# but we pull it through a subtree mirror, so therefore it isn't listed in +# Chromium's deps but it is in ours. 
+LIBYUV_ONLY_DEPS = [ + 'src/base', + 'src/build', + 'src/buildtools', + 'src/ios', + 'src/testing', + 'src/third_party', + 'src/third_party/android_support_test_runner', + 'src/third_party/bazel', + 'src/third_party/bouncycastle', + 'src/third_party/errorprone/lib', + 'src/third_party/findbugs', + 'src/third_party/gson', + 'src/third_party/gtest-parallel', + 'src/third_party/guava', + 'src/third_party/intellij', + 'src/third_party/jsr-305/src', + 'src/third_party/ow2_asm', + 'src/third_party/proguard', + 'src/third_party/ub-uiautomator/lib', + 'src/tools', + 'src/tools/clang/dsymutil', +] + +LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' +CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src' +CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s' +CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' +CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' + +COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([-0-9a-z]+)\'$') +ROLL_BRANCH_NAME = 'roll_chromium_revision' + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +CHECKOUT_SRC_DIR = FindSrcDirPath() +CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) + +# Copied from tools/android/roll/android_deps/.../BuildConfigGenerator.groovy. +ANDROID_DEPS_START = r'=== ANDROID_DEPS Generated Code Start ===' +ANDROID_DEPS_END = r'=== ANDROID_DEPS Generated Code End ===' +# Location of automically gathered android deps. 
+ANDROID_DEPS_PATH = 'src/third_party/android_deps/' + +sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) +import find_depot_tools + +find_depot_tools.add_depot_tools_to_path() + +CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' +CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', + 'clang', 'scripts', 'update.py') + +DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') +ChangedDep = collections.namedtuple('ChangedDep', + 'path url current_rev new_rev') +CipdDepsEntry = collections.namedtuple('CipdDepsEntry', 'path packages') +VersionEntry = collections.namedtuple('VersionEntry', 'version') +ChangedCipdPackage = collections.namedtuple( + 'ChangedCipdPackage', 'path package current_version new_version') +ChangedVersionEntry = collections.namedtuple( + 'ChangedVersionEntry', 'path current_version new_version') + +ChromiumRevisionUpdate = collections.namedtuple('ChromiumRevisionUpdate', + ('current_chromium_rev ' + 'new_chromium_rev ')) + + +class RollError(Exception): + pass + + +def StrExpansion(): + return lambda str_value: str_value + + +def VarLookup(local_scope): + return lambda var_name: local_scope['vars'][var_name] + + +def ParseDepsDict(deps_content): + local_scope = {} + global_scope = { + 'Str': StrExpansion(), + 'Var': VarLookup(local_scope), + 'deps_os': {}, + } + exec(deps_content, global_scope, local_scope) + return local_scope + + +def ParseLocalDepsFile(filename): + with open(filename, 'rb') as f: + deps_content = f.read().decode('utf-8') + return ParseDepsDict(deps_content) + + +def ParseCommitPosition(commit_message): + for line in reversed(commit_message.splitlines()): + m = COMMIT_POSITION_RE.match(line.strip()) + if m: + return int(m.group(1)) + logging.error('Failed to parse commit position id from:\n%s\n', + commit_message) + sys.exit(-1) + + +def _RunCommand(command, + working_dir=None, + ignore_exit_code=False, + extra_env=None, + input_data=None): + """Runs a command and returns the 
output from that command. + + If the command fails (exit code != 0), the function will exit the process. + + Returns: + A tuple containing the stdout and stderr outputs as strings. + """ + working_dir = working_dir or CHECKOUT_SRC_DIR + logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) + env = os.environ.copy() + if extra_env: + assert all(isinstance(value, str) for value in extra_env.values()) + logging.debug('extra env: %s', extra_env) + env.update(extra_env) + p = subprocess.Popen(command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + cwd=working_dir, + universal_newlines=True) + std_output, err_output = p.communicate(input_data) + p.stdout.close() + p.stderr.close() + if not ignore_exit_code and p.returncode != 0: + logging.error('Command failed: %s\n' + 'stdout:\n%s\n' + 'stderr:\n%s\n', ' '.join(command), std_output, err_output) + sys.exit(p.returncode) + return std_output, err_output + + +def _GetBranches(): + """Returns a tuple of active,branches. + + The 'active' is the name of the currently active branch and 'branches' is a + list of all branches. + """ + lines = _RunCommand(['git', 'branch'])[0].split('\n') + branches = [] + active = '' + for line in lines: + if '*' in line: + # The assumption is that the first char will always be the '*'. + active = line[1:].strip() + branches.append(active) + else: + branch = line.strip() + if branch: + branches.append(branch) + return active, branches + + +def _ReadGitilesContent(url): + # Download and decode BASE64 content until + # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed. + base64_content = ReadUrlContent(url + '?format=TEXT') + return base64.b64decode(base64_content[0]).decode('utf-8') + + +def ReadRemoteCrFile(path_below_src, revision): + """Reads a remote Chromium file of a specific revision. + + Args: + path_below_src: A path to the target file relative to src dir. + revision: Revision to read. 
+ Returns: + A string with file content. + """ + return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % + (revision, path_below_src)) + + +def ReadRemoteCrCommit(revision): + """Reads a remote Chromium commit message. Returns a string.""" + return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision) + + +def ReadUrlContent(url): + """Connect to a remote host and read the contents. + + Args: + url: URL to connect to. + Returns: + A list of lines. + """ + conn = urllib.request.urlopen(url) + try: + return conn.readlines() + except IOError as e: + logging.exception('Error connecting to %s. Error: %s', url, e) + raise + finally: + conn.close() + + +def GetMatchingDepsEntries(depsentry_dict, dir_path): + """Gets all deps entries matching the provided path. + + This list may contain more than one DepsEntry object. + Example: dir_path='src/testing' would give results containing both + 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's + DEPS. + Example 2: dir_path='src/build' should return 'src/build' but not + 'src/buildtools'. + + Returns: + A list of DepsEntry objects. 
+ """ + result = [] + for path, depsentry in depsentry_dict.items(): + if path == dir_path: + result.append(depsentry) + else: + parts = path.split('/') + if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))): + result.append(depsentry) + return result + + +def BuildDepsentryDict(deps_dict): + """Builds a dict of paths to DepsEntry objects from a raw deps dict.""" + result = {} + + def AddDepsEntries(deps_subdict): + for path, dep in deps_subdict.items(): + if path in result: + continue + if not isinstance(dep, dict): + dep = {'url': dep} + if dep.get('dep_type') == 'cipd': + result[path] = CipdDepsEntry(path, dep['packages']) + else: + if '@' not in dep['url']: + continue + url, revision = dep['url'].split('@') + result[path] = DepsEntry(path, url, revision) + + def AddVersionEntry(vars_subdict): + for key, value in vars_subdict.items(): + if key in result: + continue + if not key.endswith('_version'): + continue + key = re.sub('_version$', '', key) + result[key] = VersionEntry(value) + + AddDepsEntries(deps_dict['deps']) + for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']: + AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) + AddVersionEntry(deps_dict.get('vars', {})) + return result + + +def _FindChangedCipdPackages(path, old_pkgs, new_pkgs): + old_pkgs_names = {p['package'] for p in old_pkgs} + new_pkgs_names = {p['package'] for p in new_pkgs} + pkgs_equal = (old_pkgs_names == new_pkgs_names) + added_pkgs = [p for p in new_pkgs_names if p not in old_pkgs_names] + removed_pkgs = [p for p in old_pkgs_names if p not in new_pkgs_names] + + assert pkgs_equal, ('Old: %s\n New: %s.\nYou need to do a manual roll ' + 'and remove/add entries in DEPS so the old and new ' + 'list match.\nMost likely, you should add \"%s\" and ' + 'remove \"%s\"' % + (old_pkgs, new_pkgs, added_pkgs, removed_pkgs)) + + for old_pkg in old_pkgs: + for new_pkg in new_pkgs: + old_version = old_pkg['version'] + new_version = new_pkg['version'] + if 
(old_pkg['package'] == new_pkg['package'] + and old_version != new_version): + logging.debug('Roll dependency %s to %s', path, new_version) + yield ChangedCipdPackage(path, old_pkg['package'], old_version, + new_version) + + +def _FindChangedVars(name, old_version, new_version): + if old_version != new_version: + logging.debug('Roll dependency %s to %s', name, new_version) + yield ChangedVersionEntry(name, old_version, new_version) + + +def _FindNewDeps(old, new): + """ Gather dependencies only in `new` and return corresponding paths. """ + old_entries = set(BuildDepsentryDict(old)) + new_entries = set(BuildDepsentryDict(new)) + return [ + path for path in new_entries - old_entries + if path not in DONT_AUTOROLL_THESE + ] + + +def FindAddedDeps(libyuv_deps, new_cr_deps): + """ + Calculate new deps entries of interest. + + Ideally, that would mean: only appearing in chromium DEPS + but transitively used in LibYUV. + + Since it's hard to compute, we restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + Otherwise, assumes that's a Chromium-only dependency. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a new package in existing dep. + + Returns: + A tuple consisting of: + A list of paths added dependencies sitting in `ANDROID_DEPS_PATH`. + A list of paths for other added dependencies. + """ + all_added_deps = _FindNewDeps(libyuv_deps, new_cr_deps) + generated_android_deps = [ + path for path in all_added_deps if path.startswith(ANDROID_DEPS_PATH) + ] + other_deps = [ + path for path in all_added_deps if path not in generated_android_deps + ] + return generated_android_deps, other_deps + + +def FindRemovedDeps(libyuv_deps, new_cr_deps): + """ + Calculate obsolete deps entries. + + Ideally, that would mean: no more appearing in chromium DEPS + and not used in LibYUV. 
+ + Since it's hard to compute: + 1/ We restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + 2/ We rely on existing behavior of CalculateChangeDeps. + I.e. Assumes non-CIPD dependencies are LibYUV-only, don't remove them. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a deleted package in existing dep. + + Returns: + A tuple consisting of: + A list of paths of dependencies removed from `ANDROID_DEPS_PATH`. + A list of paths of unexpected disappearing dependencies. + """ + all_removed_deps = _FindNewDeps(new_cr_deps, libyuv_deps) + generated_android_deps = sorted( + [path for path in all_removed_deps if path.startswith(ANDROID_DEPS_PATH)]) + # Webrtc-only dependencies are handled in CalculateChangedDeps. + other_deps = sorted([ + path for path in all_removed_deps + if path not in generated_android_deps and path not in LIBYUV_ONLY_DEPS + ]) + return generated_android_deps, other_deps + + +def CalculateChangedDeps(libyuv_deps, new_cr_deps): + """ + Calculate changed deps entries based on entries defined in the LibYUV DEPS + file: + - If a shared dependency with the Chromium DEPS file: roll it to the same + revision as Chromium (i.e. entry in the new_cr_deps dict) + - If it's a Chromium sub-directory, roll it to the HEAD revision (notice + this means it may be ahead of the chromium_revision, but generally these + should be close). + - If it's another DEPS entry (not shared with Chromium), roll it to HEAD + unless it's configured to be skipped. + + Returns: + A list of ChangedDep objects representing the changed deps. 
+ """ + result = [] + libyuv_entries = BuildDepsentryDict(libyuv_deps) + new_cr_entries = BuildDepsentryDict(new_cr_deps) + for path, libyuv_deps_entry in libyuv_entries.items(): + if path in DONT_AUTOROLL_THESE: + continue + cr_deps_entry = new_cr_entries.get(path) + if cr_deps_entry: + assert type(cr_deps_entry) is type(libyuv_deps_entry) + + if isinstance(cr_deps_entry, CipdDepsEntry): + result.extend( + _FindChangedCipdPackages(path, libyuv_deps_entry.packages, + cr_deps_entry.packages)) + continue + + if isinstance(cr_deps_entry, VersionEntry): + result.extend( + _FindChangedVars(path, libyuv_deps_entry.version, + cr_deps_entry.version)) + continue + + # Use the revision from Chromium's DEPS file. + new_rev = cr_deps_entry.revision + assert libyuv_deps_entry.url == cr_deps_entry.url, ( + 'LibYUV DEPS entry %s has a different URL %s than Chromium %s.' % + (path, libyuv_deps_entry.url, cr_deps_entry.url)) + else: + if isinstance(libyuv_deps_entry, DepsEntry): + # Use the HEAD of the deps repo. + stdout, _ = _RunCommand( + ['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD']) + new_rev = stdout.strip().split('\t')[0] + else: + # The dependency has been removed from chromium. + # This is handled by FindRemovedDeps. + continue + + # Check if an update is necessary. 
+ if libyuv_deps_entry.revision != new_rev: + logging.debug('Roll dependency %s to %s', path, new_rev) + result.append( + ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision, + new_rev)) + return sorted(result) + + +def CalculateChangedClang(new_cr_rev): + + def GetClangRev(lines): + for line in lines: + match = CLANG_REVISION_RE.match(line) + if match: + return match.group(1) + raise RollError('Could not parse Clang revision!') + + with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f: + current_lines = f.readlines() + current_rev = GetClangRev(current_lines) + + new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, + new_cr_rev).splitlines() + new_rev = GetClangRev(new_clang_update_py) + return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) + + +def GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps_list, + added_deps_paths=None, + removed_deps_paths=None, + clang_change=None, +): + current_cr_rev = rev_update.current_chromium_rev[0:10] + new_cr_rev = rev_update.new_chromium_rev[0:10] + rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) + git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) + + commit_msg = [ + 'Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval), + 'Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval), + 'Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval) + ] + + def Section(adjective, deps): + noun = 'dependency' if len(deps) == 1 else 'dependencies' + commit_msg.append('%s %s' % (adjective, noun)) + + if changed_deps_list: + Section('Changed', changed_deps_list) + + for c in changed_deps_list: + if isinstance(c, ChangedCipdPackage): + commit_msg.append('* %s: %s..%s' % + (c.path, c.current_version, c.new_version)) + elif isinstance(c, ChangedVersionEntry): + commit_msg.append('* %s_vesion: %s..%s' % + (c.path, c.current_version, c.new_version)) + else: + commit_msg.append('* %s: %s/+log/%s..%s' % + 
(c.path, c.url, c.current_rev[0:10], c.new_rev[0:10])) + + if added_deps_paths: + Section('Added', added_deps_paths) + commit_msg.extend('* %s' % p for p in added_deps_paths) + + if removed_deps_paths: + Section('Removed', removed_deps_paths) + commit_msg.extend('* %s' % p for p in removed_deps_paths) + + if any([changed_deps_list, added_deps_paths, removed_deps_paths]): + change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') + commit_msg.append('DEPS diff: %s\n' % change_url) + else: + commit_msg.append('No dependencies changed.') + + if clang_change and clang_change.current_rev != clang_change.new_rev: + commit_msg.append('Clang version changed %s:%s' % + (clang_change.current_rev, clang_change.new_rev)) + change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, + CLANG_UPDATE_SCRIPT_URL_PATH) + commit_msg.append('Details: %s\n' % change_url) + else: + commit_msg.append('No update to Clang.\n') + + commit_msg.append('BUG=None') + return '\n'.join(commit_msg) + + +def UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content): + """Update the DEPS file with the new revision.""" + + with open(deps_filename, 'rb') as deps_file: + deps_content = deps_file.read().decode('utf-8') + + # Update the chromium_revision variable. + deps_content = deps_content.replace(rev_update.current_chromium_rev, + rev_update.new_chromium_rev) + + # Add and remove dependencies. For now: only generated android deps. + # Since gclient cannot add or remove deps, we on the fact that + # these android deps are located in one place we can copy/paste. 
+ deps_re = re.compile(ANDROID_DEPS_START + '.*' + ANDROID_DEPS_END, re.DOTALL) + new_deps = deps_re.search(new_cr_content) + old_deps = deps_re.search(deps_content) + if not new_deps or not old_deps: + faulty = 'Chromium' if not new_deps else 'LibYUV' + raise RollError('Was expecting to find "%s" and "%s"\n' + 'in %s DEPS' % + (ANDROID_DEPS_START, ANDROID_DEPS_END, faulty)) + deps_content = deps_re.sub(new_deps.group(0), deps_content) + + for dep in changed_deps: + if isinstance(dep, ChangedVersionEntry): + deps_content = deps_content.replace(dep.current_version, dep.new_version) + + with open(deps_filename, 'wb') as deps_file: + deps_file.write(deps_content.encode('utf-8')) + + # Update each individual DEPS entry. + for dep in changed_deps: + # ChangedVersionEntry types are already been processed. + if isinstance(dep, ChangedVersionEntry): + continue + local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) + if not os.path.isdir(local_dep_dir): + raise RollError( + 'Cannot find local directory %s. Either run\n' + 'gclient sync --deps=all\n' + 'or make sure the .gclient file for your solution contains all ' + 'platforms in the target_os list, i.e.\n' + 'target_os = ["android", "unix", "mac", "ios", "win"];\n' + 'Then run "gclient sync" again.' 
% local_dep_dir) + if isinstance(dep, ChangedCipdPackage): + package = dep.package.format() # Eliminate double curly brackets + update = '%s:%s@%s' % (dep.path, package, dep.new_version) + else: + update = '%s@%s' % (dep.path, dep.new_rev) + _RunCommand(['gclient', 'setdep', '--revision', update], + working_dir=CHECKOUT_SRC_DIR) + + +def _IsTreeClean(): + stdout, _ = _RunCommand(['git', 'status', '--porcelain']) + if len(stdout) == 0: + return True + + logging.error('Dirty/unversioned files:\n%s', stdout) + return False + + +def _EnsureUpdatedMainBranch(dry_run): + current_branch = _RunCommand(['git', 'rev-parse', '--abbrev-ref', + 'HEAD'])[0].splitlines()[0] + if current_branch != 'main': + logging.error('Please checkout the main branch and re-run this script.') + if not dry_run: + sys.exit(-1) + + logging.info('Updating main branch...') + _RunCommand(['git', 'pull']) + + +def _CreateRollBranch(dry_run): + logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME) + if not dry_run: + _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME]) + + +def _RemovePreviousRollBranch(dry_run): + active_branch, branches = _GetBranches() + if active_branch == ROLL_BRANCH_NAME: + active_branch = 'main' + if ROLL_BRANCH_NAME in branches: + logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME) + if not dry_run: + _RunCommand(['git', 'checkout', active_branch]) + _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME]) + + +def _LocalCommit(commit_msg, dry_run): + logging.info('Committing changes locally.') + if not dry_run: + _RunCommand(['git', 'add', '--update', '.']) + _RunCommand(['git', 'commit', '-m', commit_msg]) + + +def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos): + if skip_cq: + return 0 + if (new_commit_pos - current_commit_pos) < cq_over: + return 1 + return 2 + + +def _GetCcRecipients(changed_deps_list): + """Returns a list of emails to notify based on the changed deps list. 
+ """ + cc_recipients = [] + for c in changed_deps_list: + pass + return cc_recipients + + +def _UploadCL(commit_queue_mode, add_cc=None): + """Upload the committed changes as a changelist to Gerrit. + + commit_queue_mode: + - 2: Submit to commit queue. + - 1: Run trybots but do not submit to CQ. + - 0: Skip CQ, upload only. + + add_cc: A list of email addresses to add as CC recipients. + """ + cc_recipients = [] + if add_cc: + cc_recipients.extend(add_cc) + cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks'] + if commit_queue_mode >= 2: + logging.info('Sending the CL to the CQ...') + cmd.extend(['-o', 'label=Bot-Commit+1']) + cmd.extend(['-o', 'label=Commit-Queue+2']) + cmd.extend(['--send-mail', '--cc', ','.join(cc_recipients)]) + elif commit_queue_mode >= 1: + logging.info('Starting CQ dry run...') + cmd.extend(['-o', 'label=Commit-Queue+1']) + extra_env = { + 'EDITOR': 'true', + 'SKIP_GCE_AUTH_FOR_GIT': '1', + } + stdout, stderr = _RunCommand(cmd, extra_env=extra_env) + logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s', + stdout, stderr) + + +def GetRollRevisionRanges(opts, libyuv_deps): + current_cr_rev = libyuv_deps['vars']['chromium_revision'] + new_cr_rev = opts.revision + if not new_cr_rev: + stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) + head_rev = stdout.strip().split('\t')[0] + logging.info('No revision specified. Using HEAD: %s', head_rev) + new_cr_rev = head_rev + + return ChromiumRevisionUpdate(current_cr_rev, new_cr_rev) + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--clean', + action='store_true', + default=False, + help='Removes any previous local roll branch.') + p.add_argument('-r', + '--revision', + help=('Chromium Git revision to roll to. 
Defaults to the ' + 'Chromium HEAD revision if omitted.')) + p.add_argument('--dry-run', + action='store_true', + default=False, + help=('Calculate changes and modify DEPS, but don\'t create ' + 'any local branch, commit, upload CL or send any ' + 'tryjobs.')) + p.add_argument('-i', + '--ignore-unclean-workdir', + action='store_true', + default=False, + help=('Ignore if the current branch is not main or if there ' + 'are uncommitted changes (default: %(default)s).')) + grp = p.add_mutually_exclusive_group() + grp.add_argument('--skip-cq', + action='store_true', + default=False, + help='Skip sending the CL to the CQ (default: %(default)s)') + grp.add_argument('--cq-over', + type=int, + default=1, + help=('Commit queue dry run if the revision difference ' + 'is below this number (default: %(default)s)')) + p.add_argument('-v', + '--verbose', + action='store_true', + default=False, + help='Be extra verbose in printing of log messages.') + opts = p.parse_args() + + if opts.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + if not opts.ignore_unclean_workdir and not _IsTreeClean(): + logging.error('Please clean your local checkout first.') + return 1 + + if opts.clean: + _RemovePreviousRollBranch(opts.dry_run) + + if not opts.ignore_unclean_workdir: + _EnsureUpdatedMainBranch(opts.dry_run) + + deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') + libyuv_deps = ParseLocalDepsFile(deps_filename) + + rev_update = GetRollRevisionRanges(opts, libyuv_deps) + + current_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.current_chromium_rev)) + new_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.new_chromium_rev)) + + new_cr_content = ReadRemoteCrFile('DEPS', rev_update.new_chromium_rev) + new_cr_deps = ParseDepsDict(new_cr_content) + changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) + # Discard other deps, assumed to be chromium-only dependencies. 
+ new_generated_android_deps, _ = FindAddedDeps(libyuv_deps, new_cr_deps) + removed_generated_android_deps, other_deps = FindRemovedDeps( + libyuv_deps, new_cr_deps) + if other_deps: + raise RollError('LibYUV DEPS entries are missing from Chromium: %s.\n' + 'Remove them or add them to either ' + 'LIBYUV_ONLY_DEPS or DONT_AUTOROLL_THESE.' % other_deps) + clang_change = CalculateChangedClang(rev_update.new_chromium_rev) + commit_msg = GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps, + added_deps_paths=new_generated_android_deps, + removed_deps_paths=removed_generated_android_deps, + clang_change=clang_change) + logging.debug('Commit message:\n%s', commit_msg) + + _CreateRollBranch(opts.dry_run) + if not opts.dry_run: + UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content) + if _IsTreeClean(): + logging.info("No DEPS changes detected, skipping CL creation.") + else: + _LocalCommit(commit_msg, opts.dry_run) + commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, + current_commit_pos, new_commit_pos) + logging.info('Uploading CL...') + if not opts.dry_run: + _UploadCL(commit_queue_mode, _GetCcRecipients(changed_deps)) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/tools_libyuv/autoroller/unittests/roll_deps_test.py index af86bdd5..af86bdd5 100755 --- a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py +++ b/tools_libyuv/autoroller/unittests/roll_deps_test.py diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS b/tools_libyuv/autoroller/unittests/testdata/DEPS index 4f45860c..4f45860c 100644 --- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS +++ b/tools_libyuv/autoroller/unittests/testdata/DEPS diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new index d53083ce..d53083ce 100644 --- 
a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new +++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old index dd6ddaec..dd6ddaec 100644 --- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old +++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old diff --git a/files/tools_libyuv/get_landmines.py b/tools_libyuv/get_landmines.py index 8b33483e..8b33483e 100755 --- a/files/tools_libyuv/get_landmines.py +++ b/tools_libyuv/get_landmines.py diff --git a/tools_libyuv/msan/OWNERS b/tools_libyuv/msan/OWNERS new file mode 100644 index 00000000..9b67a8f6 --- /dev/null +++ b/tools_libyuv/msan/OWNERS @@ -0,0 +1,3 @@ +mbonadei@chromium.org +fbarchard@chromium.org +pbos@chromium.org diff --git a/files/tools_libyuv/msan/blacklist.txt b/tools_libyuv/msan/blacklist.txt index 8b5e42a7..8b5e42a7 100644 --- a/files/tools_libyuv/msan/blacklist.txt +++ b/tools_libyuv/msan/blacklist.txt diff --git a/tools_libyuv/ubsan/OWNERS b/tools_libyuv/ubsan/OWNERS new file mode 100644 index 00000000..9b67a8f6 --- /dev/null +++ b/tools_libyuv/ubsan/OWNERS @@ -0,0 +1,3 @@ +mbonadei@chromium.org +fbarchard@chromium.org +pbos@chromium.org diff --git a/files/tools_libyuv/ubsan/blacklist.txt b/tools_libyuv/ubsan/blacklist.txt index 8bcb2907..8bcb2907 100644 --- a/files/tools_libyuv/ubsan/blacklist.txt +++ b/tools_libyuv/ubsan/blacklist.txt diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/tools_libyuv/ubsan/vptr_blacklist.txt index 23cfca53..23cfca53 100644 --- a/files/tools_libyuv/ubsan/vptr_blacklist.txt +++ b/tools_libyuv/ubsan/vptr_blacklist.txt diff --git a/files/unit_test/basictypes_test.cc b/unit_test/basictypes_test.cc index 9aaa2dcd..9aaa2dcd 100644 --- a/files/unit_test/basictypes_test.cc +++ b/unit_test/basictypes_test.cc diff --git a/files/unit_test/color_test.cc 
b/unit_test/color_test.cc index 01267ff1..01267ff1 100644 --- a/files/unit_test/color_test.cc +++ b/unit_test/color_test.cc diff --git a/files/unit_test/compare_test.cc b/unit_test/compare_test.cc index c29562cb..c29562cb 100644 --- a/files/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc diff --git a/files/unit_test/convert_test.cc b/unit_test/convert_argb_test.cc index 1f975825..aeee8a7f 100644 --- a/files/unit_test/convert_test.cc +++ b/unit_test/convert_argb_test.cc @@ -1,5 +1,5 @@ /* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * Copyright 2023 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -31,6 +31,13 @@ #include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ #endif +#if defined(__riscv) && !defined(__clang__) +#define DISABLE_SLOW_TESTS +#undef ENABLE_FULL_TESTS +#undef ENABLE_ROW_TESTS +#define LEAN_TESTS +#endif + // Some functions fail on big endian. Enable these tests on all cpus except // PowerPC, but they are not optimized so disabled by default. #if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__) @@ -48,500 +55,15 @@ namespace libyuv { #define AR30ToAR30 ARGBCopy #define ABGRToABGR ARGBCopy +// subsample amount uses a divide. 
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) -// Planar test - -#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - SRC_DEPTH) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ - static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ - static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ - static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ - static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ - static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ - const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ - const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ - const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ - align_buffer_page_end(src_u, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(src_v, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ - MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ - MemRandomize(src_v + 
OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ - SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ - SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \ - SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ - } \ - for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ - src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ - src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ - } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ - reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ - reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \ - reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ - reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ - reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \ - reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \ - NEG kHeight); \ - } \ - for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ - EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ - } \ - for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \ - EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \ - EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - 
free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - } - -#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 2, SRC_DEPTH) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0, SRC_DEPTH) - -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) -TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8) -TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8) -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8) -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8) -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8) -TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8) -TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8) -TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8) -TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8) -TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8) 
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) -TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) -TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) -TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) -TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) -TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) -TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12) -TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12) -TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) -TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) -TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10) -TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) -TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) -TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) -TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) -TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) - -// Test Android 420 to I420 -#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSizeUV = \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_uv, \ - kSizeUV*((PIXEL_STRIDE == 3) ? 
3 : 2) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - uint8_t* src_u = src_uv + OFF_U; \ - uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \ - int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ - (fastrand() & 0xff); \ - src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ - (fastrand() & 0xff); \ - } \ - } \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_u_c, 2, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_u_opt, 102, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \ - kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i 
< benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \ - dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ - dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ - dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ - } - -#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ - SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ - SUBSAMP_Y) \ - TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ - _Any, +, 0, PN, OFF_U, OFF_V) \ - TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ - _Unaligned, +, 2, PN, OFF_U, OFF_V) \ - TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \ - 
-, 0, PN, OFF_U, OFF_V) \ - TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ - 0, PN, OFF_U, OFF_V) - -TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2) -TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2) -TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) -#undef TESTAPLANARTOP -#undef TESTAPLANARTOPI - -// wrapper to keep API the same -int I400ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* /* src_u */, - int /* src_stride_u */, - const uint8_t* /* src_v */, - int /* src_stride_v */, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, width, height); -} - -#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - SRC_DEPTH) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ - static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ - static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ - static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ - static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ - static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ - const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ - const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ - const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* 
kHeight* SRC_BPC + OFF); \ - align_buffer_page_end(src_u, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(src_v, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_c, \ - kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_opt, \ - kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ - MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ - MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ - SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ - SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \ - SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ - } \ - for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ - src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ - src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ - } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \ - src_v_p, kSrcHalfWidth, \ - reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ - reinterpret_cast<DST_T*>(dst_uv_c), \ - kDstHalfWidth * 2, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ - reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ - reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \ 
- NEG kHeight); \ - } \ - for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ - EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ - } \ - for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \ - EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - } - -#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ - SRC_DEPTH) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) - -TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) -TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOBP(I210, 
uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) -TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) -TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) - -#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ - static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ - static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ - static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ - static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ - static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ - const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ - const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ - const int kPaddedHeight = \ - (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ - const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ - const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ - align_buffer_page_end( \ - src_uv, \ - 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_c, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - 
align_buffer_page_end(dst_uv_opt, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ - SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \ - for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \ - src_y_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \ - src_uv_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \ - reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? 
reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \ - reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - } \ - if (DOY) { \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - } \ - for (int i = 0; i < kDstHalfHeight; ++i) { \ - for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ - EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ - dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ - } +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) -#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, 
SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) - -TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) - -#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ +#define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ @@ -621,30 +143,39 @@ TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - 
TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) +#if defined(ENABLE_FULL_TESTS) +#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) +#else +#define 
TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) +#endif -TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOP(P010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10, 1, 1) +TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1) // Provide matrix wrappers for full range bt.709 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \ @@ -680,8 +211,12 @@ TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) #define I422ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ kFilterBilinear) - -#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) +#define I420ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) +#define I422ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF) \ @@ -746,8 +281,6 @@ TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ 
TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ + 1, _Any, +, 0) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0) #endif @@ -792,8 +325,12 @@ TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) +TESTPLANARTOB(I422, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I422, 1, 1, RAW, 3, 3, 1) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) +TESTPLANARTOB(I444, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I444, 1, 1, RAW, 3, 3, 1) TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1) @@ -816,7 +353,9 @@ TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1) #endif TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) -#else +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) +TESTPLANARTOB(I422, 2, 2, RGB24Filter, 3, 3, 1) +#else // FULL_TESTS TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1) TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1) TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1) @@ -832,232 +371,21 @@ TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1) -TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) +TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) #endif -#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, 
ALIGN, \ - YALIGN, W1280, N, NEG, OFF, ATTEN) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = W1280; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - src_a[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ - dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ - ATTEN); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ - dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \ - ATTEN); \ - } \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(src_a); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } - -#if 
defined(ENABLE_FULL_TESTS) -#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 2, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Premult, +, 0, 1) -#else -#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0) -#endif - -#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define F420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define F420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, 
j, &kYuv2020Constants, k, \ - l, m) -#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ - l, m) -#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ - l, m) -#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ - l, m) -#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define J422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define F422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define F422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define H422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define H422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define U422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ - l, m) -#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ - l, m) -#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ - l, m) -#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, 
&kYuvV2020Constants, k, \ - l, m) -#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define J444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ - l, m) -#define F444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define F444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ - l, m) -#define H444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define H444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ - l, m) -#define U444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ - l, m) -#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ - l, m) -#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ - l, m) -#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ - l, m) - -#define I420AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \ - &kYuvI601Constants, k, l, m, kFilterBilinear) -#define I422AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I422AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \ - &kYuvI601Constants, k, l, m, kFilterBilinear) - -#if defined(ENABLE_FULL_TESTS) -TESTQPLANARTOB(I420Alpha, 2, 2, 
ARGB, 4, 4, 1) -TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1) -TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) -TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1) -TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) -#else -TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) -TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) -TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 
4, 4, 1) -TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) -#endif - -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, W1280, N, NEG, OFF) \ +#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ @@ -1110,15 +438,21 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Opt, +, 0) +#if defined(ENABLE_FULL_TESTS) +#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Invert, -, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Opt, +, 0) +#endif #define JNV12ToARGB(a, b, c, d, e, f, g, h) \ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) @@ -1139,187 +473,30 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) #define 
JNV12ToRGB565(a, b, c, d, e, f, g, h) \ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) -TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3) -#ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2) -#endif - -TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3) +TESTBPTOB(JNV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(JNV21, 2, 2, RAW, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) -#endif - -#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - 
align_buffer_page_end(dst_uv_opt, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kStride; ++j) \ - src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ - kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ - dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ - kStrideUV * 2, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ - for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_argb); \ - } - -#if defined(ENABLE_FULL_TESTS) -#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, 
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) -#else -#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) +TESTBPTOB(JNV12, 2, 2, RGB565, RGB565, 2) #endif -TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2) -TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2) -TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) -TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) -TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) -TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +TESTBPTOB(NV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, YUV24, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) -TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) -TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2) +TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2) #endif -TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) -TESTATOPLANAR(I400, 1, 1, I420, 2, 2) -TESTATOPLANAR(J400, 1, 1, J420, 2, 2) -TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) -TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) -TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) -TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) -TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) -TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2) -TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) -TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) -TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) - -#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ - SUBSAMP_Y, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ - const int kStrideUV = 
SUBSAMPLE(kWidth, SUBSAMP_X); \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kStride; ++j) \ - src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ - kStrideUV * 2, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ - dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV * 2; ++j) { \ - EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \ - dst_uv_opt[i * kStrideUV * 2 + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_argb); \ - } - -#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - 
benchmark_width_, _Unaligned, +, 2) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) - -TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(RAW, 1, 3, JNV21, 2, 2) -TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ @@ -1440,6 +617,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) @@ -1450,7 +628,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) -TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) @@ -1484,6 +662,127 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// in place test +#define TESTATOAI(FMT_A, TYPE_A, EPP_A, 
STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memcpy(dst_argb_c + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_c /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_c, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define 
TESTATOA(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Inplace, +, 0) + +TESTATOA(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) +#endif +TESTATOA(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOA(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +// TODO(fbarchard): Support in place for mirror. 
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) +// TODO(fbarchard): Support in place for conversions that increase bpp. +// TESTATOA(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +// TESTATOA(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, 
uint8_t, 4, 4, 1) +// TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) + #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ @@ -1554,6 +853,7 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) } \ } +#if defined(ENABLE_FULL_TESTS) #define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ @@ -1566,6 +866,12 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) HEIGHT_B, benchmark_width_, _Opt, +, 0) \ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) +#else +#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B) \ + TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B) +#endif #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) @@ -1634,1081 +940,217 @@ TESTEND(BGRAToARGB, uint8_t, 4, 4, 1) TESTEND(ABGRToARGB, uint8_t, 4, 4, 1) TESTEND(AB64ToAR64, uint16_t, 4, 4, 1) -#ifdef HAVE_JPEG -TEST_F(LibYUVConvertTest, ValidateJpeg) { - const int kOff = 10; - const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg - ? 
benchmark_width_ * benchmark_height_ - : kMinJpeg; - const int kSize = kImageSize + kOff; - align_buffer_page_end(orig_pixels, kSize); - - // No SOI or EOI. Expect fail. - memset(orig_pixels, 0, kSize); - EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); - - // Test special value that matches marker start. - memset(orig_pixels, 0xff, kSize); - EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); - - // EOI, SOI. Expect pass. - orig_pixels[0] = 0xff; - orig_pixels[1] = 0xd8; // SOI. - orig_pixels[2] = 0xff; - orig_pixels[kSize - kOff + 0] = 0xff; - orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. - for (int times = 0; times < benchmark_iterations_; ++times) { - EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize)); - } - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVConvertTest, ValidateJpegLarge) { - const int kOff = 10; - const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg - ? benchmark_width_ * benchmark_height_ - : kMinJpeg; - const int kSize = kImageSize + kOff; - const int kMultiple = 10; - const int kBufSize = kImageSize * kMultiple + kOff; - align_buffer_page_end(orig_pixels, kBufSize); - - // No SOI or EOI. Expect fail. - memset(orig_pixels, 0, kBufSize); - EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize)); - - // EOI, SOI. Expect pass. - orig_pixels[0] = 0xff; - orig_pixels[1] = 0xd8; // SOI. - orig_pixels[2] = 0xff; - orig_pixels[kSize - kOff + 0] = 0xff; - orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. - for (int times = 0; times < benchmark_iterations_; ++times) { - EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize)); - } - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVConvertTest, InvalidateJpeg) { - const int kOff = 10; - const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg - ? benchmark_width_ * benchmark_height_ - : kMinJpeg; - const int kSize = kImageSize + kOff; - align_buffer_page_end(orig_pixels, kSize); - - // NULL pointer. Expect fail. 
- EXPECT_FALSE(ValidateJpeg(NULL, kSize)); - - // Negative size. Expect fail. - EXPECT_FALSE(ValidateJpeg(orig_pixels, -1)); - - // Too large size. Expect fail. - EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull)); - - // No SOI or EOI. Expect fail. - memset(orig_pixels, 0, kSize); - EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); - - // SOI but no EOI. Expect fail. - orig_pixels[0] = 0xff; - orig_pixels[1] = 0xd8; // SOI. - orig_pixels[2] = 0xff; - for (int times = 0; times < benchmark_iterations_; ++times) { - EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); - } - - // EOI but no SOI. Expect fail. - orig_pixels[0] = 0; - orig_pixels[1] = 0; - orig_pixels[kSize - kOff + 0] = 0xff; - orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. - EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); - - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVConvertTest, FuzzJpeg) { - // SOI but no EOI. Expect fail. - for (int times = 0; times < benchmark_iterations_; ++times) { - const int kSize = fastrand() % 5000 + 3; - align_buffer_page_end(orig_pixels, kSize); - MemRandomize(orig_pixels, kSize); - - // Add SOI so frame will be scanned. - orig_pixels[0] = 0xff; - orig_pixels[1] = 0xd8; // SOI. - orig_pixels[2] = 0xff; - orig_pixels[kSize - 1] = 0xff; - ValidateJpeg(orig_pixels, - kSize); // Failure normally expected. - free_aligned_buffer_page_end(orig_pixels); - } -} - -// Test data created in GIMP. In export jpeg, disable -// thumbnails etc, choose a subsampling, and use low quality -// (50) to keep size small. 
Generated with xxd -i test.jpg -// test 0 is J400 -static const uint8_t kTest0Jpg[] = { - 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, - 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, - 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, - 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, - 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, - 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, - 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, - 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10, - 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01, - 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4, - 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, - 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, - 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, - 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, - 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, - 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, - 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, - 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, - 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, - 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, - 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 
0x23, 0xf9, - 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08, - 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10, - 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, - 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, - 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b, - 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, - 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, - 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, - 0xd9}; -static const size_t kTest0JpgLen = 421; - -// test 1 is J444 -static const uint8_t kTest1Jpg[] = { - 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, - 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, - 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, - 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, - 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, - 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, - 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, - 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, - 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, - 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, - 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, - 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda, - 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, - 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb, - 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, - 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, - 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, - 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00, - 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08, - 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31, - 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, - 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, - 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72, - 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, - 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, - 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, - 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, - 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, - 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, - 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, - 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, - 0x5d, 0x7a, 0x35, 0x02, 
0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, - 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff, - 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, - 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, - 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26, - 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, - 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5, - 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00, - 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, - 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, - 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, - 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, - 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, - 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, - 0xd4, 0xff, 0xd9}; -static const size_t kTest1JpgLen = 735; - -// test 2 is J420 -static const uint8_t kTest2Jpg[] = { - 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, - 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, - 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, - 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, - 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, - 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, - 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, - 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 
0x43, 0x01, 0x11, 0x12, - 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, - 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, - 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff, - 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff, - 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, - 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00, - 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, - 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, - 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, - 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, - 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, - 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, - 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, - 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 
0x02, 0x4b, 0xb3, 0x28, - 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, - 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, - 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, - 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, - 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, - 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, - 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f, - 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, - 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e, - 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, - 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10, - 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, - 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, - 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, - 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, - 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, - 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, - 0xd9}; -static const size_t kTest2JpgLen = 685; - -// test 3 is J422 -static const uint8_t kTest3Jpg[] = { - 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, - 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, - 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, - 0x11, 0x10, 
0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, - 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, - 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, - 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, - 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, - 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, - 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, - 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, - 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff, - 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, - 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4, - 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, - 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, - 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, - 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, - 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18, - 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda, - 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84, - 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, - 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, - 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, - 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, - 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, - 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, - 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, - 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, - 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff, - 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, - 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53, - 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, - 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08, - 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca, - 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, - 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, - 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, - 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, - 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, - 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, - 0x47, 0xa7, 
0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, - 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; -static const size_t kTest3JpgLen = 704; - -// test 4 is J422 vertical - not supported -static const uint8_t kTest4Jpg[] = { - 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, - 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, - 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, - 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, - 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, - 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, - 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, - 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, - 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, - 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, - 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, - 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff, - 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff, - 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, - 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4, - 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, - 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 
0x01, 0x05, - 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, - 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, - 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, - 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, - 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff, - 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, - 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, - 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, - 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, - 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, - 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, - 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, - 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, - 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, - 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19, - 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff, - 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca, - 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01, - 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 
0x41, 0xff, - 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, - 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, - 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, - 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, - 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, - 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, - 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; -static const size_t kTest4JpgLen = 701; - -TEST_F(LibYUVConvertTest, TestMJPGSize) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - printf("test jpeg size %d x %d\n", width, height); -} - -TEST_F(LibYUVConvertTest, TestMJPGToI420) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_u, half_width * half_height); - align_buffer_page_end(dst_v, half_width * half_height); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width, - dst_v, half_width, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381); - uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_u_hash, 2501859930u); - EXPECT_EQ(dst_v_hash, 2126459123u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_u); - free_aligned_buffer_page_end(dst_v); -} - -TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - // Convert to NV21 - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_vu, half_width * half_height * 2); - - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Convert to I420 - align_buffer_page_end(dst2_y, width * height); - align_buffer_page_end(dst2_u, half_width * half_height); - align_buffer_page_end(dst2_v, half_width * half_height); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width, - dst2_v, half_width, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Convert I420 to NV21 - align_buffer_page_end(dst3_y, width * height); - align_buffer_page_end(dst3_vu, half_width * half_height * 2); - - I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y, - width, dst3_vu, half_width * 2, width, height); - - for (int i = 0; i < width * height; ++i) { - EXPECT_EQ(dst_y[i], dst3_y[i]); - } - for (int i = 0; i < half_width * half_height * 2; 
++i) { - EXPECT_EQ(dst_vu[i], dst3_vu[i]); - EXPECT_EQ(dst_vu[i], dst3_vu[i]); - } - - free_aligned_buffer_page_end(dst3_y); - free_aligned_buffer_page_end(dst3_vu); - - free_aligned_buffer_page_end(dst2_y); - free_aligned_buffer_page_end(dst2_u); - free_aligned_buffer_page_end(dst2_v); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_vu); -} - -TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - // Convert to NV12 - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Convert to I420 - align_buffer_page_end(dst2_y, width * height); - align_buffer_page_end(dst2_u, half_width * half_height); - align_buffer_page_end(dst2_v, half_width * half_height); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width, - dst2_v, half_width, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Convert I420 to NV12 - align_buffer_page_end(dst3_y, width * height); - align_buffer_page_end(dst3_uv, half_width * half_height * 2); - - I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y, - width, dst3_uv, half_width * 2, width, height); - - for (int i = 0; i < width * height; ++i) { - EXPECT_EQ(dst_y[i], dst3_y[i]); - } - for (int i = 0; i < half_width * half_height * 2; ++i) { - EXPECT_EQ(dst_uv[i], dst3_uv[i]); - EXPECT_EQ(dst_uv[i], 
dst3_uv[i]); - } - - free_aligned_buffer_page_end(dst3_y); - free_aligned_buffer_page_end(dst3_uv); - - free_aligned_buffer_page_end(dst2_y); - free_aligned_buffer_page_end(dst2_u); - free_aligned_buffer_page_end(dst2_v); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_uv_hash, 1069662856u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. Hashes are for VU so flip the plane. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - align_buffer_page_end(dst_vu, half_width * half_height * 2); - SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, - half_height); - uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_vu_hash, 1069662856u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); - free_aligned_buffer_page_end(dst_vu); -} - -// TODO(fbarchard): Improve test to compare against I422, not checksum -TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_uv_hash, 493520167u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); -} - -TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. Hashes are for VU so flip the plane. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - align_buffer_page_end(dst_vu, half_width * half_height * 2); - SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, - half_height); - uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_vu_hash, 493520167u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); - free_aligned_buffer_page_end(dst_vu); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 330644005u); - EXPECT_EQ(dst_uv_hash, 135214341u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. Hashes are for VU so flip the plane. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - align_buffer_page_end(dst_vu, half_width * half_height * 2); - SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, - half_height); - uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 330644005u); - EXPECT_EQ(dst_vu_hash, 135214341u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); - free_aligned_buffer_page_end(dst_vu); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_uv_hash, 506143297u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); -} - -TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int half_width = (width + 1) / 2; - int half_height = (height + 1) / 2; - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_y, width * height); - align_buffer_page_end(dst_uv, half_width * half_height * 2); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, - half_width * 2, width, height, width, height); - } - // Expect sucesss - EXPECT_EQ(0, ret); - - // Test result matches known hash value. Hashes are for VU so flip the plane. 
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); - align_buffer_page_end(dst_vu, half_width * half_height * 2); - SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, - half_height); - uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); - EXPECT_EQ(dst_y_hash, 2682851208u); - EXPECT_EQ(dst_vu_hash, 506143297u); - - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_uv); - free_aligned_buffer_page_end(dst_vu); -} - -TEST_F(LibYUVConvertTest, TestMJPGToARGB) { - int width = 0; - int height = 0; - int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); - EXPECT_EQ(0, ret); - - int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * - benchmark_height_ / (width * height); - - align_buffer_page_end(dst_argb, width * height * 4); - for (int times = 0; times < benchmark_iterations; ++times) { - ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width, - height, width, height); +#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, N, NEG, OFF, ATTEN) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ + src_a[i + OFF] = (fastrand() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 
0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ + dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ } - // Expect sucesss - EXPECT_EQ(0, ret); - // Test result matches known hash value. 
- uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); -#ifdef LIBYUV_UNLIMITED_DATA - EXPECT_EQ(dst_argb_hash, 3900633302u); +#if defined(ENABLE_FULL_TESTS) +#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 2, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Premult, +, 0, 1) #else - EXPECT_EQ(dst_argb_hash, 2355976473u); +#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0) #endif - free_aligned_buffer_page_end(dst_argb); -} - -static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) { - MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - - int width = mjpeg_decoder.GetWidth(); - int height = mjpeg_decoder.GetHeight(); - - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - printf("JPeg is J420, %dx%d %d bytes\n", width, height, - static_cast<int>(sample_size)); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == 
MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - printf("JPeg is J422, %dx%d %d bytes\n", width, height, - static_cast<int>(sample_size)); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - printf("JPeg is J444, %dx%d %d bytes\n", width, height, - static_cast<int>(sample_size)); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - printf("JPeg is J400, %dx%d %d bytes\n", width, height, - static_cast<int>(sample_size)); - } else { - // Unknown colorspace. - printf("JPeg is Unknown colorspace.\n"); - } - mjpeg_decoder.UnloadFrame(); - return ret; -} - -TEST_F(LibYUVConvertTest, TestMJPGInfo) { - EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen)); - EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); - EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); - EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); - EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, - kTest4JpgLen)); // Valid but unsupported. 
-} -#endif // HAVE_JPEG - -TEST_F(LibYUVConvertTest, NV12Crop) { - const int SUBSAMP_X = 2; - const int SUBSAMP_Y = 2; - const int kWidth = benchmark_width_; - const int kHeight = benchmark_height_; - const int crop_y = - ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; - const int kDestWidth = benchmark_width_; - const int kDestHeight = benchmark_height_ - crop_y * 2; - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); - const int sample_size = - kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; - align_buffer_page_end(src_y, sample_size); - uint8_t* src_uv = src_y + kWidth * kHeight; - - align_buffer_page_end(dst_y, kDestWidth * kDestHeight); - align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight); - align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - for (int i = 0; i < kHeight * kWidth; ++i) { - src_y[i] = (fastrand() & 0xff); - } - for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) { - src_uv[i] = (fastrand() & 0xff); - } - memset(dst_y, 1, kDestWidth * kDestHeight); - memset(dst_u, 2, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_v, 3, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_y_2, 1, kDestWidth * kDestHeight); - memset(dst_u_2, 2, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_v_2, 3, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2, - SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2, - SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, 
crop_y, kWidth, kHeight, - kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12); - - NV12ToI420(src_y + crop_y * kWidth, kWidth, - src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y, - kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, - SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight); - - for (int i = 0; i < kDestHeight; ++i) { - for (int j = 0; j < kDestWidth; ++j) { - EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]); - } - } - for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { - for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { - EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], - dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); - } - } - for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { - for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { - EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], - dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); - } - } - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_u); - free_aligned_buffer_page_end(dst_v); - free_aligned_buffer_page_end(dst_y_2); - free_aligned_buffer_page_end(dst_u_2); - free_aligned_buffer_page_end(dst_v_2); - free_aligned_buffer_page_end(src_y); -} - -TEST_F(LibYUVConvertTest, I420CropOddY) { - const int SUBSAMP_X = 2; - const int SUBSAMP_Y = 2; - const int kWidth = benchmark_width_; - const int kHeight = benchmark_height_; - const int crop_y = benchmark_height_ > 1 ? 
1 : 0; - const int kDestWidth = benchmark_width_; - const int kDestHeight = benchmark_height_ - crop_y * 2; - const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X); - const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X); - const int sample_size = kWidth * kHeight + - kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) + - kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y); - align_buffer_page_end(src_y, sample_size); - uint8_t* src_u = src_y + kWidth * kHeight; - uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y); - - align_buffer_page_end(dst_y, kDestWidth * kDestHeight); - align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - for (int i = 0; i < kHeight * kWidth; ++i) { - src_y[i] = (fastrand() & 0xff); - } - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) { - src_u[i] = (fastrand() & 0xff); - } - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) { - src_v[i] = (fastrand() & 0xff); - } - memset(dst_y, 1, kDestWidth * kDestHeight); - memset(dst_u, 2, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_v, 3, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - MaskCpuFlags(benchmark_cpu_info_); - for (int i = 0; i < benchmark_iterations_; ++i) { - ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u, - SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, - SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, - kDestWidth, kDestHeight, libyuv::kRotate0, - libyuv::FOURCC_I420); - } +#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F420AlphaToARGB(a, b, c, d, e, f, g, h, 
i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, 
l, m) \ + I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ 
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) - for (int i = 0; i < kDestHeight; ++i) { - for (int j = 0; j < kDestWidth; ++j) { - EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j], - dst_y[i * kDestWidth + j]); - } - } - for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { - for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { - EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j], - dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); - } - } - for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { - for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { - EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j], - dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); - } - } +#define I420AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \ + &kYuvI601Constants, k, l, m, kFilterBilinear) +#define I422AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I422AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \ + &kYuvI601Constants, k, l, m, kFilterBilinear) - free_aligned_buffer_page_end(dst_y); - free_aligned_buffer_page_end(dst_u); - free_aligned_buffer_page_end(dst_v); - free_aligned_buffer_page_end(src_y); -} +#if defined(ENABLE_FULL_TESTS) +TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1) 
+TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1) +TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) +#else +TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1) +TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) +#endif TEST_F(LibYUVConvertTest, TestYToARGB) { uint8_t y[32]; @@ -2846,6 +1288,7 @@ TEST_F(LibYUVConvertTest, TestDither) { free_aligned_buffer_page_end(dst_argb32_opt); \ } +#if defined(ENABLE_FULL_TESTS) #define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -2856,71 +1299,17 @@ 
TEST_F(LibYUVConvertTest, TestDither) { YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) +#else +#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) +#endif #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) #endif -#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ - TEST_F(LibYUVConvertTest, NAME) { \ - const int kWidth = benchmark_width_; \ - const int kHeight = benchmark_height_; \ - \ - align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ - align_buffer_page_end(orig_y, kWidth* kHeight); \ - align_buffer_page_end(orig_u, \ - SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ - align_buffer_page_end(orig_v, \ - SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ - \ - align_buffer_page_end(dst_y_orig, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_orig, \ - 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ - \ - align_buffer_page_end(dst_y, kWidth* kHeight); \ - align_buffer_page_end(dst_uv, \ - 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ - \ - MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ - \ - /* Convert UYVY to NV12 in 2 steps for reference */ \ - libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \ - orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ - SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ - libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ - SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \ - 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ - \ - /* Convert to NV12 */ \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \ - dst_uv, 2 * SUBSAMPLE(kWidth, 
2), kWidth, kHeight); \ - } \ - \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - EXPECT_EQ(orig_y[i], dst_y[i]); \ - } \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - EXPECT_EQ(dst_y_orig[i], dst_y[i]); \ - } \ - for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \ - ++i) { \ - EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \ - } \ - \ - free_aligned_buffer_page_end(orig_uyvy); \ - free_aligned_buffer_page_end(orig_y); \ - free_aligned_buffer_page_end(orig_u); \ - free_aligned_buffer_page_end(orig_v); \ - free_aligned_buffer_page_end(dst_y_orig); \ - free_aligned_buffer_page_end(dst_uv_orig); \ - free_aligned_buffer_page_end(dst_y); \ - free_aligned_buffer_page_end(dst_uv); \ - } - -TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) -TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) - // Transitive test. A to B to C is same as A to C. // Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. #define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -3223,6 +1612,7 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) free_aligned_buffer_page_end(dst_argb_bc); \ } +#if defined(ENABLE_FULL_TESTS) #define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ @@ -3232,6 +1622,11 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) _Invert, -, 0, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Opt, +, 0, FMT_C, BPP_C) +#else +#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \ + TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ + _Opt, +, 0, FMT_C, BPP_C) +#endif // Caveat: Destination needs to be 4 bytes #ifdef LITTLE_ENDIAN_ONLY_TEST @@ -3348,11 +1743,15 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { } #endif // HAS_ABGRTOAR30ROW_AVX2 +#if !defined(LEAN_TESTS) + // Provide matrix 
wrappers for 12 bit YUV #define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \ I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) @@ -3440,6 +1839,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { free_aligned_buffer_page_end(dst_argb_opt); \ } +#if defined(ENABLE_FULL_TESTS) #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ BPP_B, ALIGN, YALIGN) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ @@ -3450,6 +1850,12 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) +#else +#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) +#endif // These conversions are only optimized for x86 #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) @@ -3495,6 +1901,7 @@ TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AB30, 4, 4, 1) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30Filter, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1) #endif // LITTLE_ENDIAN_ONLY_TEST @@ -3733,8 +2140,8 @@ TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGBFilter, 4, 4, 1, 10) TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 
1, 10) #endif // DISABLE_SLOW_TESTS -#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ +#define TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -3777,16 +2184,23 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +#if defined(ENABLE_FULL_TESTS) +#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +#else +#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, 
SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +#endif #define P010ToARGB(a, b, c, d, e, f, g, h) \ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) @@ -3829,23 +2243,23 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) kFilterBilinear) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) -TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) #endif // 
LITTLE_ENDIAN_ONLY_TEST #endif // DISABLE_SLOW_TESTS @@ -4281,61 +2695,6 @@ TEST_F(LibYUVConvertTest, Test565) { uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381); EXPECT_EQ(610919429u, checksum); } - -// Test RGB24 to J420 is exact -#if defined(LIBYUV_BIT_EXACT) -TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { - const int kSize = 256; - align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 - align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2); - int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / - (kSize * 2) * benchmark_iterations_; - - for (int i = 0; i < kSize * 3 * 2; ++i) { - orig_rgb24[i] = i; - } - - for (int i = 0; i < iterations256; ++i) { - RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane - dest_j420 + kSize * 2, kSize / 2, // U plane - dest_j420 + kSize * 5 / 2, kSize / 2, // V plane - kSize, 2); - } - - uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381); - EXPECT_EQ(2755440272u, checksum); - - free_aligned_buffer_page_end(orig_rgb24); - free_aligned_buffer_page_end(dest_j420); -} -#endif - -// Test RGB24 to I420 is exact -#if defined(LIBYUV_BIT_EXACT) -TEST_F(LibYUVConvertTest, TestRGB24ToI420) { - const int kSize = 256; - align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 - align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2); - int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / - (kSize * 2) * benchmark_iterations_; - - for (int i = 0; i < kSize * 3 * 2; ++i) { - orig_rgb24[i] = i; - } - - for (int i = 0; i < iterations256; ++i) { - RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane - dest_i420 + kSize * 2, kSize / 2, // U plane - dest_i420 + kSize * 5 / 2, kSize / 2, // V plane - kSize, 2); - } - - uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381); - EXPECT_EQ(1526656597u, checksum); - - free_aligned_buffer_page_end(orig_rgb24); - free_aligned_buffer_page_end(dest_i420); -} -#endif 
+#endif // !defined(LEAN_TESTS) } // namespace libyuv diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc new file mode 100644 index 00000000..f55bace3 --- /dev/null +++ b/unit_test/convert_test.cc @@ -0,0 +1,2110 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdlib.h> +#include <time.h> + +#include "libyuv/basic_types.h" +#include "libyuv/compare.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "../unit_test/unit_test.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/video_common.h" + +#ifdef ENABLE_ROW_TESTS +#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ +#endif + +#if defined(__riscv) && !defined(__clang__) +#define DISABLE_SLOW_TESTS +#undef ENABLE_FULL_TESTS +#undef ENABLE_ROW_TESTS +#define LEAN_TESTS +#endif + +// Some functions fail on big endian. Enable these tests on all cpus except +// PowerPC, but they are not optimized so disabled by default. +#if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__) +#define LITTLE_ENDIAN_ONLY_TEST 1 +#endif +#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +// SLOW TESTS are those that are unoptimized C code. +// FULL TESTS are optimized but test many variations of the same code. 
+#define ENABLE_FULL_TESTS +#endif + +namespace libyuv { + +// Alias to copy pixels as is +#define AR30ToAR30 ARGBCopy +#define ABGRToABGR ARGBCopy + +// subsample amount uses a divide. +#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) + +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) + +// Planar test + +#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ + SRC_DEPTH) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_u, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_v, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_u_opt, kDstHalfWidth* 
kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ + MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \ + SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ + src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ + src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ + reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \ + reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ + reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \ + reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ + EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ + } \ + for (int i = 0; i < 
kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \ + EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \ + EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, SRC_DEPTH) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH) +#else +#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH) +#endif + +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) +TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8) 
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8) +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8) +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8) +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8) +TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8) +TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8) +TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8) +TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) +TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) +TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 10) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) +TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I212, 
uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) + +// Test Android 420 to I420 +#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSizeUV = \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, \ + kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + uint8_t* src_u = src_uv + OFF_U; \ + uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? 
kSizeUV : OFF_V); \ + int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ + (fastrand() & 0xff); \ + src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_u_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 3, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_u_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 103, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \ + kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \ + dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { 
\ + EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ + SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ + _Any, +, 0, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ + _Unaligned, +, 2, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \ + -, 0, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ + 0, PN, OFF_U, OFF_V) +#else +#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ + SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ + 0, PN, OFF_U, OFF_V) +#endif + +TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2) 
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2) +TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) +#undef TESTAPLANARTOP +#undef TESTAPLANARTOPI + +// wrapper to keep API the same +int I400ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* /* src_u */, + int /* src_stride_u */, + const uint8_t* /* src_v */, + int /* src_stride_v */, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu, + dst_stride_vu, width, height); +} + +#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ + SRC_DEPTH) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_u, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_v, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + 
align_buffer_page_end(dst_uv_c, \ + kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ + MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \ + SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ + src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ + src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \ + src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_c), \ + kDstHalfWidth * 2, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ + EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ + } \ + for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \ + EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ + } \ + 
free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ + SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) +#else +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) +#endif + +TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I400, 
uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) +TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) + +#define TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, DOY, SRC_DEPTH, \ + TILE_WIDTH, TILE_HEIGHT) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ + const int kPaddedHeight = \ + (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ + const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ + align_buffer_page_end( \ + src_uv, \ + 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_c, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + 
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \ + for (int i = 0; \ + i < kPaddedWidth * kPaddedHeight * SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_y_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2 * \ + SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_uv_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? 
reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + } \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + } \ + for (int i = 0; i < kDstHalfHeight; ++i) { \ + for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ + EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ + dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, 
DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) +#else +#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) +#endif + +TESTBPTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOBP(MT2T, uint8_t, 10 / 8, 2, 2, P010, uint16_t, 2, 2, 2, 10, 16, 32) + +#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + 
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ + kStrideUV * 2, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, 
_Invert, -, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#endif + +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2) +TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) +TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) +TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2) +#endif +TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2) +TESTATOPLANAR(J400, 1, 1, J420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) +TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) +TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) + +#define TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_a_c, kWidth* kHeight); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_a_opt, kWidth* kHeight); \ + 
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 2, kWidth* kHeight); \ + memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 102, kWidth* kHeight); \ + memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ + dst_a_c, kWidth, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ + kStrideUV * 2, dst_a_opt, kWidth, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_a_c); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_a_opt); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, 
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#endif + +TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2) + +#define TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + 
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV * 2; ++j) { \ + EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \ + dst_uv_opt[i * kStrideUV * 2 + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#endif + +TESTATOBP(ARGB, 1, 4, NV12, 2, 2) +TESTATOBP(ARGB, 1, 4, NV21, 2, 2) +TESTATOBP(ABGR, 1, 4, NV12, 2, 2) +TESTATOBP(ABGR, 1, 4, NV21, 2, 2) +TESTATOBP(RAW, 1, 3, JNV21, 2, 2) +TESTATOBP(YUY2, 2, 4, NV12, 2, 2) +TESTATOBP(UYVY, 2, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV21, 2, 2) + +#if !defined(LEAN_TESTS) + +#ifdef HAVE_JPEG +TEST_F(LibYUVConvertTest, ValidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? 
benchmark_width_ * benchmark_height_ + : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_page_end(orig_pixels, kSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + // Test special value that matches marker start. + memset(orig_pixels, 0xff, kSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + // EOI, SOI. Expect pass. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; + orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize)); + } + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVConvertTest, ValidateJpegLarge) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? benchmark_width_ * benchmark_height_ + : kMinJpeg; + const int kSize = kImageSize + kOff; + const int kMultiple = 10; + const int kBufSize = kImageSize * kMultiple + kOff; + align_buffer_page_end(orig_pixels, kBufSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kBufSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize)); + + // EOI, SOI. Expect pass. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; + orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize)); + } + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVConvertTest, InvalidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? benchmark_width_ * benchmark_height_ + : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_page_end(orig_pixels, kSize); + + // NULL pointer. Expect fail. 
+ EXPECT_FALSE(ValidateJpeg(NULL, kSize)); + + // Negative size. Expect fail. + EXPECT_FALSE(ValidateJpeg(orig_pixels, -1)); + + // Too large size. Expect fail. + EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull)); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + // SOI but no EOI. Expect fail. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + } + + // EOI but no SOI. Expect fail. + orig_pixels[0] = 0; + orig_pixels[1] = 0; + orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVConvertTest, FuzzJpeg) { + // SOI but no EOI. Expect fail. + for (int times = 0; times < benchmark_iterations_; ++times) { + const int kSize = fastrand() % 5000 + 3; + align_buffer_page_end(orig_pixels, kSize); + MemRandomize(orig_pixels, kSize); + + // Add SOI so frame will be scanned. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; + orig_pixels[kSize - 1] = 0xff; + ValidateJpeg(orig_pixels, + kSize); // Failure normally expected. + free_aligned_buffer_page_end(orig_pixels); + } +} + +// Test data created in GIMP. In export jpeg, disable +// thumbnails etc, choose a subsampling, and use low quality +// (50) to keep size small. 
Generated with xxd -i test.jpg +// test 0 is J400 +static const uint8_t kTest0Jpg[] = { + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, + 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, + 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, + 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, + 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, + 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, + 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10, + 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01, + 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4, + 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, + 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, + 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, + 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, + 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, + 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, + 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, + 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, + 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, + 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 
0x23, 0xf9, + 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10, + 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, + 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, + 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b, + 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, + 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, + 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, + 0xd9}; +static const size_t kTest0JpgLen = 421; + +// test 1 is J444 +static const uint8_t kTest1Jpg[] = { + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, + 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, + 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, + 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, + 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, + 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, + 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, + 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, + 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, + 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, + 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda, + 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, + 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb, + 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, + 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, + 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, + 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00, + 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08, + 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31, + 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, + 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72, + 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, + 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, + 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, + 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, + 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, + 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, + 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, + 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, + 0x5d, 0x7a, 0x35, 0x02, 
0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, + 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff, + 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, + 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, + 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26, + 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, + 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5, + 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00, + 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, + 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, + 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, + 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, + 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, + 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, + 0xd4, 0xff, 0xd9}; +static const size_t kTest1JpgLen = 735; + +// test 2 is J420 +static const uint8_t kTest2Jpg[] = { + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, + 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, + 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, + 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, + 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, + 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, + 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 
0x43, 0x01, 0x11, 0x12, + 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, + 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, + 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff, + 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff, + 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, + 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00, + 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, + 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, + 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, + 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, + 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, + 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, + 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 
0x02, 0x4b, 0xb3, 0x28, + 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, + 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, + 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, + 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, + 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, + 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, + 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f, + 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, + 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e, + 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, + 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10, + 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, + 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, + 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, + 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, + 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, + 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, + 0xd9}; +static const size_t kTest2JpgLen = 685; + +// test 3 is J422 +static const uint8_t kTest3Jpg[] = { + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, + 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, + 0x11, 0x10, 
0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, + 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, + 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, + 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, + 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, + 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, + 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, + 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, + 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff, + 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, + 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4, + 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, + 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, + 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, + 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, + 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18, + 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda, + 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84, + 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, + 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, + 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, + 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, + 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, + 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, + 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, + 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, + 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff, + 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, + 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53, + 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, + 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08, + 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca, + 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, + 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, + 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, + 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, + 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, + 0x47, 0xa7, 
0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, + 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; +static const size_t kTest3JpgLen = 704; + +// test 4 is J422 vertical - not supported +static const uint8_t kTest4Jpg[] = { + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, + 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, + 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, + 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, + 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, + 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, + 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, + 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, + 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, + 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff, + 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff, + 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, + 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4, + 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, + 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 
0x01, 0x05, + 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, + 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, + 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, + 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, + 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff, + 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, + 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, + 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, + 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, + 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, + 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, + 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, + 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, + 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19, + 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff, + 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca, + 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01, + 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 
0x41, 0xff, + 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, + 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, + 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, + 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, + 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, + 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, + 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; +static const size_t kTest4JpgLen = 701; + +TEST_F(LibYUVConvertTest, TestMJPGSize) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + printf("test jpeg size %d x %d\n", width, height); +} + +TEST_F(LibYUVConvertTest, TestMJPGToI420) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_u, half_width * half_height); + align_buffer_page_end(dst_v, half_width * half_height); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width, + dst_v, half_width, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381); + uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_u_hash, 2501859930u); + EXPECT_EQ(dst_v_hash, 2126459123u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_u); + free_aligned_buffer_page_end(dst_v); +} + +TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + // Convert to NV21 + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_vu, half_width * half_height * 2); + + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Convert to I420 + align_buffer_page_end(dst2_y, width * height); + align_buffer_page_end(dst2_u, half_width * half_height); + align_buffer_page_end(dst2_v, half_width * half_height); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width, + dst2_v, half_width, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Convert I420 to NV21 + align_buffer_page_end(dst3_y, width * height); + align_buffer_page_end(dst3_vu, half_width * half_height * 2); + + I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y, + width, dst3_vu, half_width * 2, width, height); + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_y[i], 
dst3_y[i]); + } + for (int i = 0; i < half_width * half_height * 2; ++i) { + EXPECT_EQ(dst_vu[i], dst3_vu[i]); + EXPECT_EQ(dst_vu[i], dst3_vu[i]); + } + + free_aligned_buffer_page_end(dst3_y); + free_aligned_buffer_page_end(dst3_vu); + + free_aligned_buffer_page_end(dst2_y); + free_aligned_buffer_page_end(dst2_u); + free_aligned_buffer_page_end(dst2_v); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_vu); +} + +TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + // Convert to NV12 + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Convert to I420 + align_buffer_page_end(dst2_y, width * height); + align_buffer_page_end(dst2_u, half_width * half_height); + align_buffer_page_end(dst2_v, half_width * half_height); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width, + dst2_v, half_width, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Convert I420 to NV12 + align_buffer_page_end(dst3_y, width * height); + align_buffer_page_end(dst3_uv, half_width * half_height * 2); + + I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y, + width, dst3_uv, half_width * 2, width, height); + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_y[i], 
dst3_y[i]); + } + for (int i = 0; i < half_width * half_height * 2; ++i) { + EXPECT_EQ(dst_uv[i], dst3_uv[i]); + EXPECT_EQ(dst_uv[i], dst3_uv[i]); + } + + free_aligned_buffer_page_end(dst3_y); + free_aligned_buffer_page_end(dst3_uv); + + free_aligned_buffer_page_end(dst2_y); + free_aligned_buffer_page_end(dst2_u); + free_aligned_buffer_page_end(dst2_v); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_uv_hash, 1069662856u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. Hashes are for VU so flip the plane. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + align_buffer_page_end(dst_vu, half_width * half_height * 2); + SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, + half_height); + uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_vu_hash, 1069662856u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); + free_aligned_buffer_page_end(dst_vu); +} + +// TODO(fbarchard): Improve test to compare against I422, not checksum +TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_uv_hash, 493520167u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. Hashes are for VU so flip the plane. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + align_buffer_page_end(dst_vu, half_width * half_height * 2); + SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, + half_height); + uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_vu_hash, 493520167u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); + free_aligned_buffer_page_end(dst_vu); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 330644005u); + EXPECT_EQ(dst_uv_hash, 135214341u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. Hashes are for VU so flip the plane. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + align_buffer_page_end(dst_vu, half_width * half_height * 2); + SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, + half_height); + uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 330644005u); + EXPECT_EQ(dst_vu_hash, 135214341u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); + free_aligned_buffer_page_end(dst_vu); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_uv_hash, 506143297u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. Hashes are for VU so flip the plane. 
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); + align_buffer_page_end(dst_vu, half_width * half_height * 2); + SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, + half_height); + uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); + EXPECT_EQ(dst_y_hash, 2682851208u); + EXPECT_EQ(dst_vu_hash, 506143297u); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); + free_aligned_buffer_page_end(dst_vu); +} + +TEST_F(LibYUVConvertTest, TestMJPGToARGB) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } + + align_buffer_page_end(dst_argb, width * height * 4); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width, + height, width, height); + } + // Expect sucesss + EXPECT_EQ(0, ret); + + // Test result matches known hash value. 
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); +#ifdef LIBYUV_UNLIMITED_DATA + EXPECT_EQ(dst_argb_hash, 3900633302u); +#else + EXPECT_EQ(dst_argb_hash, 2355976473u); +#endif + + free_aligned_buffer_page_end(dst_argb); +} + +static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + + int width = mjpeg_decoder.GetWidth(); + int height = mjpeg_decoder.GetHeight(); + + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + printf("JPeg is J420, %dx%d %d bytes\n", width, height, + static_cast<int>(sample_size)); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + printf("JPeg is J422, %dx%d %d bytes\n", width, height, + static_cast<int>(sample_size)); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + printf("JPeg is J444, %dx%d %d bytes\n", width, height, + static_cast<int>(sample_size)); + // 
YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + printf("JPeg is J400, %dx%d %d bytes\n", width, height, + static_cast<int>(sample_size)); + } else { + // Unknown colorspace. + printf("JPeg is Unknown colorspace.\n"); + } + mjpeg_decoder.UnloadFrame(); + return ret; +} + +TEST_F(LibYUVConvertTest, TestMJPGInfo) { + EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen)); + EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); + EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); + EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); + EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, + kTest4JpgLen)); // Valid but unsupported. +} +#endif // HAVE_JPEG + +TEST_F(LibYUVConvertTest, NV12Crop) { + const int SUBSAMP_X = 2; + const int SUBSAMP_Y = 2; + const int kWidth = benchmark_width_; + const int kHeight = benchmark_height_; + const int crop_y = + ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; + const int kDestWidth = benchmark_width_; + const int kDestHeight = benchmark_height_ - crop_y * 2; + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); + const int sample_size = + kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; + align_buffer_page_end(src_y, sample_size); + uint8_t* src_uv = src_y + kWidth * kHeight; + + align_buffer_page_end(dst_y, kDestWidth * kDestHeight); + align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight); + align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + for (int i = 0; i < 
kHeight * kWidth; ++i) { + src_y[i] = (fastrand() & 0xff); + } + for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) { + src_uv[i] = (fastrand() & 0xff); + } + memset(dst_y, 1, kDestWidth * kDestHeight); + memset(dst_u, 2, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_v, 3, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_y_2, 1, kDestWidth * kDestHeight); + memset(dst_u_2, 2, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_v_2, 3, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2, + SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2, + SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, + kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12); + + NV12ToI420(src_y + crop_y * kWidth, kWidth, + src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y, + kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, + SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight); + + for (int i = 0; i < kDestHeight; ++i) { + for (int j = 0; j < kDestWidth; ++j) { + EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]); + } + } + for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { + for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { + EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], + dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); + } + } + for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { + for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { + EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], + dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); + } + } + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_u); + free_aligned_buffer_page_end(dst_v); + free_aligned_buffer_page_end(dst_y_2); + free_aligned_buffer_page_end(dst_u_2); + 
free_aligned_buffer_page_end(dst_v_2); + free_aligned_buffer_page_end(src_y); +} + +TEST_F(LibYUVConvertTest, I420CropOddY) { + const int SUBSAMP_X = 2; + const int SUBSAMP_Y = 2; + const int kWidth = benchmark_width_; + const int kHeight = benchmark_height_; + const int crop_y = benchmark_height_ > 1 ? 1 : 0; + const int kDestWidth = benchmark_width_; + const int kDestHeight = benchmark_height_ - crop_y * 2; + const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X); + const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X); + const int sample_size = kWidth * kHeight + + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) + + kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y); + align_buffer_page_end(src_y, sample_size); + uint8_t* src_u = src_y + kWidth * kHeight; + uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y); + + align_buffer_page_end(dst_y, kDestWidth * kDestHeight); + align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + for (int i = 0; i < kHeight * kWidth; ++i) { + src_y[i] = (fastrand() & 0xff); + } + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) { + src_u[i] = (fastrand() & 0xff); + } + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) { + src_v[i] = (fastrand() & 0xff); + } + memset(dst_y, 1, kDestWidth * kDestHeight); + memset(dst_u, 2, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_v, 3, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + MaskCpuFlags(benchmark_cpu_info_); + for (int i = 0; i < benchmark_iterations_; ++i) { + ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u, + SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, + SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, + kDestWidth, kDestHeight, libyuv::kRotate0, + libyuv::FOURCC_I420); + } + + for (int i = 0; i < kDestHeight; ++i) { + for 
(int j = 0; j < kDestWidth; ++j) { + EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j], + dst_y[i * kDestWidth + j]); + } + } + for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { + for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { + EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j], + dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); + } + } + for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { + for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { + EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j], + dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); + } + } + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_u); + free_aligned_buffer_page_end(dst_v); + free_aligned_buffer_page_end(src_y); +} + +#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ + TEST_F(LibYUVConvertTest, NAME) { \ + const int kWidth = benchmark_width_; \ + const int kHeight = benchmark_height_; \ + \ + align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ + align_buffer_page_end(orig_y, kWidth* kHeight); \ + align_buffer_page_end(orig_u, \ + SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + align_buffer_page_end(orig_v, \ + SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + align_buffer_page_end(dst_y_orig, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_orig, \ + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + align_buffer_page_end(dst_y, kWidth* kHeight); \ + align_buffer_page_end(dst_uv, \ + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ + \ + /* Convert UYVY to NV12 in 2 steps for reference */ \ + libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \ + orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ + SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ + libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ + SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \ + 2 * SUBSAMPLE(kWidth, 2), kWidth, 
kHeight); \ + \ + /* Convert to NV12 */ \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \ + dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ + } \ + \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + EXPECT_EQ(orig_y[i], dst_y[i]); \ + } \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + EXPECT_EQ(dst_y_orig[i], dst_y[i]); \ + } \ + for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \ + ++i) { \ + EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \ + } \ + \ + free_aligned_buffer_page_end(orig_uyvy); \ + free_aligned_buffer_page_end(orig_y); \ + free_aligned_buffer_page_end(orig_u); \ + free_aligned_buffer_page_end(orig_v); \ + free_aligned_buffer_page_end(dst_y_orig); \ + free_aligned_buffer_page_end(dst_uv_orig); \ + free_aligned_buffer_page_end(dst_y); \ + free_aligned_buffer_page_end(dst_uv); \ + } + +TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) +TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) + +TEST_F(LibYUVConvertTest, MM21ToYUY2) { + const int kWidth = (benchmark_width_ + 15) & (~15); + const int kHeight = (benchmark_height_ + 31) & (~31); + + align_buffer_page_end(orig_y, kWidth * kHeight); + align_buffer_page_end(orig_uv, + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(tmp_y, kWidth * kHeight); + align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + + MemRandomize(orig_y, kWidth * kHeight); + MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + /* Convert MM21 to YUY2 in 2 steps for reference */ + libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y, + kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), kWidth, 
kHeight); + libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), golden_yuyv, + 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + + /* Convert to NV12 */ + for (int i = 0; i < benchmark_iterations_; ++i) { + libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), + dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + } + + for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) { + EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]); + } + + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tmp_y); + free_aligned_buffer_page_end(tmp_u); + free_aligned_buffer_page_end(tmp_v); + free_aligned_buffer_page_end(dst_yuyv); + free_aligned_buffer_page_end(golden_yuyv); +} + +// Test RGB24 to J420 is exact +#if defined(LIBYUV_BIT_EXACT) +TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { + const int kSize = 256; + align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 + align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2); + int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / + (kSize * 2) * benchmark_iterations_; + + for (int i = 0; i < kSize * 3 * 2; ++i) { + orig_rgb24[i] = i; + } + + for (int i = 0; i < iterations256; ++i) { + RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane + dest_j420 + kSize * 2, kSize / 2, // U plane + dest_j420 + kSize * 5 / 2, kSize / 2, // V plane + kSize, 2); + } + + uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381); + EXPECT_EQ(2755440272u, checksum); + + free_aligned_buffer_page_end(orig_rgb24); + free_aligned_buffer_page_end(dest_j420); +} +#endif + +// Test RGB24 to I420 is exact +#if defined(LIBYUV_BIT_EXACT) +TEST_F(LibYUVConvertTest, TestRGB24ToI420) { + const int kSize = 256; + align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 + align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2); + int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 
- 1)) / + (kSize * 2) * benchmark_iterations_; + + for (int i = 0; i < kSize * 3 * 2; ++i) { + orig_rgb24[i] = i; + } + + for (int i = 0; i < iterations256; ++i) { + RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane + dest_i420 + kSize * 2, kSize / 2, // U plane + dest_i420 + kSize * 5 / 2, kSize / 2, // V plane + kSize, 2); + } + + uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381); + EXPECT_EQ(1526656597u, checksum); + + free_aligned_buffer_page_end(orig_rgb24); + free_aligned_buffer_page_end(dest_i420); +} +#endif + +#endif // !defined(LEAN_TESTS) + +} // namespace libyuv diff --git a/files/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 080778f5..437b6632 100644 --- a/files/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -20,13 +20,23 @@ namespace libyuv { TEST_F(LibYUVBaseTest, TestCpuHas) { int cpu_flags = TestCpuFlag(-1); - printf("Cpu Flags %d\n", cpu_flags); + printf("Cpu Flags 0x%x\n", cpu_flags); #if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); - printf("Has ARM %d\n", has_arm); + printf("Has ARM 0x%x\n", has_arm); int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %d\n", has_neon); + printf("Has NEON 0x%x\n", has_neon); #endif +#if defined(__riscv) && defined(__linux__) + int has_riscv = TestCpuFlag(kCpuHasRISCV); + printf("Has RISCV 0x%x\n", has_riscv); + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RVV 0x%x\n", has_rvv); + int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH); + printf("Has RVVZVFH 0x%x\n", has_rvvzvfh); +#endif +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); @@ -37,47 +47,48 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); - int has_gfni = TestCpuFlag(kCpuHasGFNI); int has_avx512bw 
= TestCpuFlag(kCpuHasAVX512BW); int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI); int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); - int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has X86 %d\n", has_x86); - printf("Has SSE2 %d\n", has_sse2); - printf("Has SSSE3 %d\n", has_ssse3); - printf("Has SSE41 %d\n", has_sse41); - printf("Has SSE42 %d\n", has_sse42); - printf("Has AVX %d\n", has_avx); - printf("Has AVX2 %d\n", has_avx2); - printf("Has ERMS %d\n", has_erms); - printf("Has FMA3 %d\n", has_fma3); - printf("Has F16C %d\n", has_f16c); - printf("Has GFNI %d\n", has_gfni); - printf("Has AVX512BW %d\n", has_avx512bw); - printf("Has AVX512VL %d\n", has_avx512vl); - printf("Has AVX512VNNI %d\n", has_avx512vnni); - printf("Has AVX512VBMI %d\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %d\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq); - + int has_avx10 = TestCpuFlag(kCpuHasAVX10); + int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI); + int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8); + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE41 0x%x\n", has_sse41); + printf("Has SSE42 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX10 
0x%x\n", has_avx10); + printf("HAS AVXVNNI 0x%x\n", has_avxvnni); + printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); +#endif #if defined(__mips__) int has_mips = TestCpuFlag(kCpuHasMIPS); - printf("Has MIPS %d\n", has_mips); + printf("Has MIPS 0x%x\n", has_mips); int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %d\n", has_msa); + printf("Has MSA 0x%x\n", has_msa); #endif - #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); - printf("Has LOONGARCH %d\n", has_loongarch); + printf("Has LOONGARCH 0x%x\n", has_loongarch); int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %d\n", has_lsx); + printf("Has LSX 0x%x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - printf("Has LASX %d\n", has_lasx); + printf("Has LASX 0x%x\n", has_lasx); #endif } @@ -104,27 +115,36 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __i386__ printf("__i386__ %d\n", __i386__); #endif -#ifdef __mips - printf("__mips %d\n", __mips); -#endif -#ifdef __mips_isa_rev - printf("__mips_isa_rev %d\n", __mips_isa_rev); -#endif #ifdef __x86_64__ printf("__x86_64__ %d\n", __x86_64__); #endif +#ifdef _M_IX86 + printf("_M_IX86 %d\n", _M_IX86); +#endif +#ifdef _M_X64 + printf("_M_X64 %d\n", _M_X64); +#endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif #ifdef __aarch64__ printf("__aarch64__ %d\n", __aarch64__); #endif -#ifdef __APPLE__ - printf("__APPLE__ %d\n", __APPLE__); -#endif #ifdef __arm__ printf("__arm__ %d\n", __arm__); #endif +#ifdef __riscv + printf("__riscv %d\n", __riscv); +#endif +#ifdef __riscv_vector + printf("__riscv_vector %d\n", __riscv_vector); +#endif +#ifdef __riscv_v_intrinsic + printf("__riscv_v_intrinsic %d\n", __riscv_v_intrinsic); +#endif +#ifdef __APPLE__ + printf("__APPLE__ %d\n", __APPLE__); +#endif #ifdef __clang__ printf("__clang__ %d\n", __clang__); #endif @@ -140,20 +160,11 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __mips_msa printf("__mips_msa %d\n", __mips_msa); #endif -#ifdef __native_client__ 
- printf("__native_client__ %d\n", __native_client__); -#endif -#ifdef __pic__ - printf("__pic__ %d\n", __pic__); -#endif -#ifdef __pnacl__ - printf("__pnacl__ %d\n", __pnacl__); -#endif -#ifdef _M_IX86 - printf("_M_IX86 %d\n", _M_IX86); +#ifdef __mips + printf("__mips %d\n", __mips); #endif -#ifdef _M_X64 - printf("_M_X64 %d\n", _M_X64); +#ifdef __mips_isa_rev + printf("__mips_isa_rev %d\n", __mips_isa_rev); #endif #ifdef _MIPS_ARCH_LOONGSON3A printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); @@ -164,8 +175,17 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef _WIN32 printf("_WIN32 %d\n", _WIN32); #endif +#ifdef __native_client__ + printf("__native_client__ %d\n", __native_client__); +#endif +#ifdef __pic__ + printf("__pic__ %d\n", __pic__); +#endif +#ifdef __pnacl__ + printf("__pnacl__ %d\n", __pnacl__); +#endif #ifdef GG_LONGLONG - printf("GG_LONGLONG %d\n", GG_LONGLONG); + printf("GG_LONGLONG %lld\n", GG_LONGLONG(1)); #endif #ifdef INT_TYPES_DEFINED printf("INT_TYPES_DEFINED\n"); @@ -200,8 +220,9 @@ TEST_F(LibYUVBaseTest, TestCpuId) { cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; - printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]), - cpu_info[0], cpu_info[1], cpu_info[2]); + printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n", + reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1], + cpu_info[2]); EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0]))); // CPU Family and Model @@ -264,6 +285,32 @@ TEST_F(LibYUVBaseTest, TestLinuxMipsMsa) { } } +TEST_F(LibYUVBaseTest, TestLinuxRVV) { + if (FileExists("../../unit_test/testdata/riscv64.txt")) { + printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n"); + + EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt")); + EXPECT_EQ(kCpuHasRVV, + RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt")); + EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH, + 
RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt")); + } else { + printf( + "WARNING: unable to load " + "\"../../unit_test/testdata/riscv64.txt\"\n"); + } +#if defined(__linux__) && defined(__riscv) + if (FileExists("/proc/cpuinfo")) { + if (!(kCpuHasRVV & RiscvCpuCaps("/proc/cpuinfo"))) { + // This can happen on RVV emulator but /proc/cpuinfo is from host. + printf("WARNING: RVV build enabled but CPU does not have RVV\n"); + } + } else { + printf("WARNING: unable to load \"/proc/cpuinfo\"\n"); + } +#endif +} + // TODO(fbarchard): Fix clangcl test of cpuflags. #ifdef _MSC_VER TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) { diff --git a/files/unit_test/cpu_thread_test.cc b/unit_test/cpu_thread_test.cc index 69aab74e..69aab74e 100644 --- a/files/unit_test/cpu_thread_test.cc +++ b/unit_test/cpu_thread_test.cc diff --git a/files/unit_test/math_test.cc b/unit_test/math_test.cc index a1544c12..a1544c12 100644 --- a/files/unit_test/math_test.cc +++ b/unit_test/math_test.cc diff --git a/files/unit_test/planar_test.cc b/unit_test/planar_test.cc index 3a8c470b..ec1d72eb 100644 --- a/files/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -30,9 +30,9 @@ #endif #if defined(LIBYUV_BIT_EXACT) -#define EXPECTED_ATTENUATE_DIFF 0 +#define EXPECTED_UNATTENUATE_DIFF 0 #else -#define EXPECTED_ATTENUATE_DIFF 2 +#define EXPECTED_UNATTENUATE_DIFF 2 #endif namespace libyuv { @@ -57,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { orig_pixels[2 * 4 + 0] = 16u; orig_pixels[2 * 4 + 1] = 64u; orig_pixels[2 * 4 + 2] = 192u; - orig_pixels[2 * 4 + 3] = 255u; + orig_pixels[2 * 4 + 3] = 128u; orig_pixels[3 * 4 + 0] = 16u; orig_pixels[3 * 4 + 1] = 64u; orig_pixels[3 * 4 + 2] = 192u; - orig_pixels[3 * 4 + 3] = 128u; - ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1); + orig_pixels[3 * 4 + 3] = 255u; + orig_pixels[4 * 4 + 0] = 255u; + orig_pixels[4 * 4 + 1] = 255u; + orig_pixels[4 * 4 + 2] = 255u; + orig_pixels[4 * 4 + 3] = 255u; + + ARGBUnattenuate(orig_pixels, 
0, unatten_pixels, 0, 5, 1); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]); EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]); @@ -71,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]); - EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]); - EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]); - EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]); - EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]); - EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]); - EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]); - EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]); - EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]); + EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]); + EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]); + EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]); + EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]); + EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]); + + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1); + EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]); + EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]); + EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]); + EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]); + EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]); + EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]); + EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]); + EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]); + EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]); + EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]); + EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[3 * 
4 + 3]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]); + + // test 255 + for (int i = 0; i < 256; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = 0; + orig_pixels[i * 4 + 2] = 0; + orig_pixels[i * 4 + 3] = 255; + } + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]); + EXPECT_EQ(0, atten_pixels[i * 4 + 1]); + EXPECT_EQ(0, atten_pixels[i * 4 + 2]); + EXPECT_EQ(255, atten_pixels[i * 4 + 3]); + } for (int i = 0; i < 1280; ++i) { orig_pixels[i * 4 + 0] = i; @@ -92,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1); } for (int i = 0; i < 1280; ++i) { - EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1); } // Make sure transparent, 50% and opaque are fully accurate. 
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]); @@ -106,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); - EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF); - EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF); - EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(255, atten_pixels[255 * 4 + 0]); + EXPECT_EQ(127, atten_pixels[255 * 4 + 1]); + EXPECT_EQ(85, atten_pixels[255 * 4 + 2]); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); @@ -165,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } static int TestUnattenuateI(int width, @@ -238,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, 
EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) { @@ -1638,29 +1684,29 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int y_plane_size = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_y, orig_plane_size); + align_buffer_page_end(tile_y, tile_plane_size); align_buffer_page_end(dst_c, y_plane_size); align_buffer_page_end(dst_opt, y_plane_size); - MemRandomize(orig_y, orig_plane_size); + MemRandomize(tile_y, tile_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 0, y_plane_size); // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_, benchmark_height_, 16); } // Enable optimizations. MaskCpuFlags(benchmark_cpu_info_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_, benchmark_height_, 16); } @@ -1668,7 +1714,46 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { EXPECT_EQ(dst_c[i], dst_opt[i]); } - free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(tile_y); + free_aligned_buffer_page_end(dst_c); + free_aligned_buffer_page_end(dst_opt); +} + +TEST_F(LibYUVPlanarTest, TestDetilePlane_16) { + int i, j; + + // orig is tiled. Allocate enough memory for tiles. + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height * 2; + int y_plane_size = benchmark_width_ * benchmark_height_ * 2; + align_buffer_page_end(tile_y, tile_plane_size); + align_buffer_page_end(dst_c, y_plane_size); + align_buffer_page_end(dst_opt, y_plane_size); + + MemRandomize(tile_y, tile_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 0, y_plane_size); + + // Disable all optimizations. + MaskCpuFlags(disable_cpu_flags_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + // Enable optimizations. 
+ MaskCpuFlags(benchmark_cpu_info_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end(tile_y); free_aligned_buffer_page_end(dst_c); free_aligned_buffer_page_end(dst_opt); } @@ -1678,33 +1763,33 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); - align_buffer_page_end(detiled_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); + align_buffer_page_end(detiled_uv, tile_plane_size); align_buffer_page_end(dst_u_two_stage, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_two_stage, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); - memset(detiled_uv, 0, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); + memset(detiled_uv, 0, tile_plane_size); memset(dst_u_two_stage, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_two_stage, 0, uv_plane_size); memset(dst_v_opt, 0, uv_plane_size); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); // Benchmark 2 step conversion for comparison. 
for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_, + DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_, benchmark_width_, benchmark_height_, 16); - SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage, + SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage, (benchmark_width_ + 1) / 2, dst_v_two_stage, (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2, benchmark_height_); @@ -1715,7 +1800,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(detiled_uv); free_aligned_buffer_page_end(dst_u_two_stage); free_aligned_buffer_page_end(dst_u_opt); @@ -1727,17 +1812,17 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); align_buffer_page_end(dst_u_c, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_c, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); memset(dst_u_c, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_c, 0, uv_plane_size); @@ -1746,7 +1831,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags_); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2, dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); @@ -1755,7 +1840,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { for (j = 0; j < benchmark_iterations_; j++) { DetileSplitUVPlane( - orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, + tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); } @@ -1764,7 +1849,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_c); @@ -2710,12 +2795,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { MaskCpuFlags(disable_cpu_flags_); ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c, benchmark_width_, benchmark_width_, benchmark_height_); - MaskCpuFlags(benchmark_cpu_info_); + double c_time = get_time(); + ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c, + benchmark_width_, benchmark_width_, benchmark_height_); + c_time = (get_time() - c_time); + MaskCpuFlags(benchmark_cpu_info_); + ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, + benchmark_width_, benchmark_width_, benchmark_height_); + double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) { ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, benchmark_width_, benchmark_width_, benchmark_height_); } + opt_time = (get_time() - opt_time) / benchmark_iterations_; + // Report performance of C vs OPT + printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6), + static_cast<int>(opt_time * 1e6)); for (int i 
= 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } @@ -2738,12 +2834,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { MaskCpuFlags(disable_cpu_flags_); ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); - MaskCpuFlags(benchmark_cpu_info_); + double c_time = get_time(); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); + c_time = (get_time() - c_time); + MaskCpuFlags(benchmark_cpu_info_); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); + double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) { ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } + opt_time = (get_time() - opt_time) / benchmark_iterations_; + + // Report performance of C vs OPT + printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6), + static_cast<int>(opt_time * 1e6)); for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } @@ -3495,8 +3603,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + // Round count up to multiple of 8 + const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); @@ -4429,4 +4537,83 @@ TEST_F(LibYUVPlanarTest, NV21Copy) { free_aligned_buffer_page_end(dst_vu); } +#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \ + defined(__aarch64__) + +TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) { + int i, j; + const int 
y_plane_size = benchmark_width_ * benchmark_height_; + + align_buffer_page_end(orig_f, y_plane_size * 4); + align_buffer_page_end(orig_y, y_plane_size * 2); + align_buffer_page_end(dst_opt, y_plane_size * 4); + align_buffer_page_end(rec_opt, y_plane_size * 2); + + for (i = 0; i < y_plane_size; ++i) { + ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f; + } + memset(orig_y, 1, y_plane_size * 2); + memset(dst_opt, 2, y_plane_size * 4); + memset(rec_opt, 3, y_plane_size * 2); + + ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y, + y_plane_size); + + for (j = 0; j < benchmark_iterations_; j++) { + ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt, + y_plane_size); + } + + ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt, + y_plane_size); + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]); + } + + free_aligned_buffer_page_end(orig_f); + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(rec_opt); +} + +TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) { + int i, j; + const int y_plane_size = benchmark_width_ * benchmark_height_; + + align_buffer_page_end(orig_f, y_plane_size * 4); + align_buffer_page_end(orig_y, y_plane_size * 2); + align_buffer_page_end(dst_opt, y_plane_size * 4); + align_buffer_page_end(rec_opt, y_plane_size * 2); + + for (i = 0; i < y_plane_size; ++i) { + ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f; + } + memset(orig_y, 1, y_plane_size * 2); + memset(dst_opt, 2, y_plane_size * 4); + memset(rec_opt, 3, y_plane_size * 2); + + ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y, + y_plane_size); + + for (j = 0; j < benchmark_iterations_; j++) { + ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt, + y_plane_size); + } + + ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt, + y_plane_size); + + for (i = 0; i < 
y_plane_size; ++i) { + EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]); + } + + free_aligned_buffer_page_end(orig_f); + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(rec_opt); +} + +#endif // defined(ENABLE_ROW_TESTS) && defined(__aarch64__) + } // namespace libyuv diff --git a/files/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc index 01ed69ca..74952c4e 100644 --- a/files/unit_test/rotate_argb_test.cc +++ b/unit_test/rotate_argb_test.cc @@ -225,4 +225,110 @@ TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) { free_aligned_buffer_page_end(src_argb); } +static void TestRotatePlane_16(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height < 1) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_stride = src_width; + int src_plane_size = src_stride * abs(src_height); + align_buffer_page_end_16(src, src_plane_size); + for (int i = 0; i < src_plane_size; ++i) { + src[i] = fastrand() & 0xff; + } + + int dst_stride = dst_width; + int dst_plane_size = dst_stride * dst_height; + align_buffer_page_end_16(dst_c, dst_plane_size); + align_buffer_page_end_16(dst_opt, dst_plane_size); + memset(dst_c, 2, dst_plane_size); + memset(dst_opt, 3, dst_plane_size); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + RotatePlane_16(src, src_stride, dst_c, dst_stride, src_width, src_height, + mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + RotatePlane_16(src, src_stride, dst_opt, dst_stride, src_width, src_height, + mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_c); + free_aligned_buffer_page_end_16(dst_opt); + free_aligned_buffer_page_end_16(src); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, 
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + } // namespace libyuv diff --git a/files/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index d3887414..abc08efa 100644 --- a/files/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -14,6 +14,10 @@ #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/rotate_row.h" +#endif + namespace libyuv { #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) @@ -596,4 +600,363 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) #undef TESTAPLANARTOP #undef TESTAPLANARTOPI +static void I010TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i010_y_size = src_width * Abs(src_height); + int src_i010_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2); + int src_i010_size = src_i010_y_size + src_i010_uv_size * 2; + align_buffer_page_end_16(src_i010, src_i010_size); + for (int i = 0; i < src_i010_size; ++i) { + src_i010[i] = fastrand() & 0x3ff; + } + + int dst_i010_y_size = dst_width * dst_height; + int dst_i010_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); + int dst_i010_size = dst_i010_y_size + dst_i010_uv_size * 2; + align_buffer_page_end_16(dst_i010_c, dst_i010_size); + align_buffer_page_end_16(dst_i010_opt, dst_i010_size); + memset(dst_i010_c, 2, dst_i010_size * 2); + memset(dst_i010_opt, 3, dst_i010_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
+ I010Rotate(src_i010, src_width, src_i010 + src_i010_y_size, + (src_width + 1) / 2, src_i010 + src_i010_y_size + src_i010_uv_size, + (src_width + 1) / 2, dst_i010_c, dst_width, + dst_i010_c + dst_i010_y_size, (dst_width + 1) / 2, + dst_i010_c + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I010Rotate( + src_i010, src_width, src_i010 + src_i010_y_size, (src_width + 1) / 2, + src_i010 + src_i010_y_size + src_i010_uv_size, (src_width + 1) / 2, + dst_i010_opt, dst_width, dst_i010_opt + dst_i010_y_size, + (dst_width + 1) / 2, dst_i010_opt + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. + for (int i = 0; i < dst_i010_size; ++i) { + EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i010_c); + free_aligned_buffer_page_end_16(dst_i010_opt); + free_aligned_buffer_page_end_16(src_i010); +} + +TEST_F(LibYUVRotateTest, I010Rotate0_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate90_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate180_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate270_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I210TestRotate(int 
src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i210_y_size = src_width * Abs(src_height); + int src_i210_uv_size = ((src_width + 1) / 2) * Abs(src_height); + int src_i210_size = src_i210_y_size + src_i210_uv_size * 2; + align_buffer_page_end_16(src_i210, src_i210_size); + for (int i = 0; i < src_i210_size; ++i) { + src_i210[i] = fastrand() & 0x3ff; + } + + int dst_i210_y_size = dst_width * dst_height; + int dst_i210_uv_size = ((dst_width + 1) / 2) * dst_height; + int dst_i210_size = dst_i210_y_size + dst_i210_uv_size * 2; + align_buffer_page_end_16(dst_i210_c, dst_i210_size); + align_buffer_page_end_16(dst_i210_opt, dst_i210_size); + memset(dst_i210_c, 2, dst_i210_size * 2); + memset(dst_i210_opt, 3, dst_i210_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I210Rotate(src_i210, src_width, src_i210 + src_i210_y_size, + (src_width + 1) / 2, src_i210 + src_i210_y_size + src_i210_uv_size, + (src_width + 1) / 2, dst_i210_c, dst_width, + dst_i210_c + dst_i210_y_size, (dst_width + 1) / 2, + dst_i210_c + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I210Rotate( + src_i210, src_width, src_i210 + src_i210_y_size, (src_width + 1) / 2, + src_i210 + src_i210_y_size + src_i210_uv_size, (src_width + 1) / 2, + dst_i210_opt, dst_width, dst_i210_opt + dst_i210_y_size, + (dst_width + 1) / 2, dst_i210_opt + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i210_size; ++i) { + EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i210_c); + free_aligned_buffer_page_end_16(dst_i210_opt); + free_aligned_buffer_page_end_16(src_i210); +} + +TEST_F(LibYUVRotateTest, I210Rotate0_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate90_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate180_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate270_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I410TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i410_y_size = src_width * Abs(src_height); + int src_i410_uv_size = src_width * Abs(src_height); + int src_i410_size = src_i410_y_size + src_i410_uv_size * 2; + align_buffer_page_end_16(src_i410, src_i410_size); + for (int i = 0; i < src_i410_size; ++i) { + src_i410[i] = fastrand() & 0x3ff; + } + + int dst_i410_y_size = dst_width * dst_height; + int dst_i410_uv_size = dst_width * dst_height; + int dst_i410_size = dst_i410_y_size + dst_i410_uv_size * 2; + 
align_buffer_page_end_16(dst_i410_c, dst_i410_size); + align_buffer_page_end_16(dst_i410_opt, dst_i410_size); + memset(dst_i410_c, 2, dst_i410_size * 2); + memset(dst_i410_opt, 3, dst_i410_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_c, dst_width, dst_i410_c + dst_i410_y_size, dst_width, + dst_i410_c + dst_i410_y_size + dst_i410_uv_size, dst_width, + src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_opt, dst_width, dst_i410_opt + dst_i410_y_size, + dst_width, dst_i410_opt + dst_i410_y_size + dst_i410_uv_size, + dst_width, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i410_size; ++i) { + EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i410_c); + free_aligned_buffer_page_end_16(dst_i410_opt); + free_aligned_buffer_page_end_16(src_i410); +} + +TEST_F(LibYUVRotateTest, I410Rotate0_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate90_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate180_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate270_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +#if defined(ENABLE_ROW_TESTS) + +TEST_F(LibYUVRotateTest, Transpose4x4_Test) { + // dst width and height + const int width = 4; + const int height = 4; + int src_pixels[4][4]; + int dst_pixels_c[4][4]; + int dst_pixels_opt[4][4]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + src_pixels[i][j] = i * 10 + j; + } + } + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + const int benchmark_iterations = + (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) / + (4 * 4); + for (int i = 0; i < benchmark_iterations; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + 
(uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]); + EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]); + } + } +} + +TEST_F(LibYUVRotateTest, Transpose4x4_Opt) { + // dst width and height + const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3; + const int height = 4; + align_buffer_page_end(src_pixels, height * width * 4); + align_buffer_page_end(dst_pixels_c, width * height * 4); + align_buffer_page_end(dst_pixels_opt, width * height * 4); + + MemRandomize(src_pixels, height * width * 4); + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + for (int i = 0; i < benchmark_iterations_; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + 
free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_opt); +} + +#endif // ENABLE_ROW_TESTS + } // namespace libyuv diff --git a/files/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index f54a68f1..f54a68f1 100644 --- a/files/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc new file mode 100644 index 00000000..9ce47a02 --- /dev/null +++ b/unit_test/scale_plane_test.cc @@ -0,0 +1,470 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "../unit_test/unit_test.h" +#include "libyuv/cpu_id.h" +#include "libyuv/scale.h" + +#ifdef ENABLE_ROW_TESTS +#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C +#endif + +#define STRINGIZE(line) #line +#define FILELINESTR(file, line) file ":" STRINGIZE(line) + +#if defined(__riscv) && !defined(__clang__) +#define DISABLE_SLOW_TESTS +#undef ENABLE_FULL_TESTS +#undef ENABLE_ROW_TESTS +#define LEAN_TESTS +#endif + +#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +// SLOW TESTS are those that are unoptimized C code. +// FULL TESTS are optimized but test many variations of the same code. 
+#define ENABLE_FULL_TESTS +#endif + +namespace libyuv { + +#ifdef ENABLE_ROW_TESTS +#ifdef HAS_SCALEROWDOWN2_SSSE3 +TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { + SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); + SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); + SIMD_ALIGNED(uint8_t dst_pixels_c[64]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); + memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); + + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (!has_ssse3) { + printf("Warning SSSE3 not detected; Skipping test.\n"); + } else { + // TL. + orig_pixels[0] = 255u; + orig_pixels[1] = 0u; + orig_pixels[128 + 0] = 0u; + orig_pixels[128 + 1] = 0u; + // TR. + orig_pixels[2] = 0u; + orig_pixels[3] = 100u; + orig_pixels[128 + 2] = 0u; + orig_pixels[128 + 3] = 0u; + // BL. + orig_pixels[4] = 0u; + orig_pixels[5] = 0u; + orig_pixels[128 + 4] = 50u; + orig_pixels[128 + 5] = 0u; + // BR. + orig_pixels[6] = 0u; + orig_pixels[7] = 0u; + orig_pixels[128 + 6] = 0u; + orig_pixels[128 + 7] = 20u; + // Odd. + orig_pixels[126] = 4u; + orig_pixels[127] = 255u; + orig_pixels[128 + 126] = 16u; + orig_pixels[128 + 127] = 255u; + + // Test regular half size. + ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(133u, dst_pixels_c[63]); + + // Test Odd width version - Last pixel is just 1 horizontal pixel. + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(10u, dst_pixels_c[63]); + + // Test one pixel less, should skip the last pixel. 
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(0u, dst_pixels_c[63]); + + // Test regular half size SSSE3. + ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); + + EXPECT_EQ(64u, dst_pixels_opt[0]); + EXPECT_EQ(25u, dst_pixels_opt[1]); + EXPECT_EQ(13u, dst_pixels_opt[2]); + EXPECT_EQ(5u, dst_pixels_opt[3]); + EXPECT_EQ(0u, dst_pixels_opt[4]); + EXPECT_EQ(133u, dst_pixels_opt[63]); + + // Compare C and SSSE3 match. + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); + ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); + for (int i = 0; i < 64; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + } +} +#endif // HAS_SCALEROWDOWN2_SSSE3 + +extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); + +TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { + SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]); + SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); + SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); + + memset(orig_pixels, 0, sizeof(orig_pixels)); + memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); + memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); + + for (int i = 0; i < 2560 * 2; ++i) { + orig_pixels[i] = i; + } + ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); + } else { + ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); + } +#else + ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); +#endif + } + + for (int i 
= 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4); + EXPECT_EQ(dst_pixels_c[1279], 3839); +} +#endif // ENABLE_ROW_TESTS + +// Test scaling plane with 8 bit C vs 12 bit C and return maximum pixel +// difference. +// 0 = exact. +static int TestPlaneFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int src_stride_y = Abs(src_width); + int dst_y_plane_size = dst_width * dst_height; + int dst_stride_y = dst_width; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + + MemRandomize(src_y, src_y_plane_size); + memset(dst_y_8, 0, dst_y_plane_size); + memset(dst_y_16, 1, dst_y_plane_size * 2); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i] & 255; + } + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y, + dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + + for (i = 0; i < benchmark_iterations; ++i) { + ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16, + dst_stride_y, dst_width, dst_height, f); + } + + // Expect an exact match. 
+ int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_y_16); + + return max_diff; +} + +// The following adjustments in dimensions ensure the scale factor will be +// exactly achieved. +// 2 is chroma subsample. +#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) +#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) + +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \ + int diff = TestPlaneFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but +// filtering is different fixed point implementations for SSSE3, Neon and C. +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(name, None, nom, denom, 0) \ + TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \ + TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \ + TEST_FACTOR1(name, Box, nom, denom, boxdiff) + +TEST_FACTOR(2, 1, 2, 0) +TEST_FACTOR(4, 1, 4, 0) +// TEST_FACTOR(8, 1, 8, 0) Disable for benchmark performance. Takes 90 seconds. 
+TEST_FACTOR(3by4, 3, 4, 1) +TEST_FACTOR(3by8, 3, 8, 1) +TEST_FACTOR(3, 1, 3, 0) +#undef TEST_FACTOR1 +#undef TEST_FACTOR +#undef SX +#undef DX + +TEST_F(LibYUVScaleTest, PlaneTest3x) { + const int kSrcStride = 480; + const int kDstStride = 160; + const int kSize = kSrcStride * 3; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 480 * 3; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; + for (int i = 0; i < iterations160; ++i) { + ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, + kFilterBilinear); + } + + EXPECT_EQ(225, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, + kFilterNone); + + EXPECT_EQ(225, dest_pixels[0]); + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTest4x) { + const int kSrcStride = 640; + const int kDstStride = 160; + const int kSize = kSrcStride * 4; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 640 * 4; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; + for (int i = 0; i < iterations160; ++i) { + ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, + kFilterBilinear); + } + + EXPECT_EQ(66, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, + kFilterNone); + + EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +// Intent is to test 200x50 to 50x200 but width and height can be parameters. 
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterNone); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. + + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterNone); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterBilinear); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. 
+ + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterBilinear); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +// Intent is to test 200x50 to 50x200 but width and height can be parameters. +TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterBox); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. + + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterBox); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTest1_Box) { + align_buffer_page_end(orig_pixels, 3); + align_buffer_page_end(dst_pixels, 3); + + // Pad the 1x1 byte image with invalid values before and after in case libyuv + // reads outside the memory boundaries. 
+ orig_pixels[0] = 0; + orig_pixels[1] = 1; // scale this pixel + orig_pixels[2] = 2; + dst_pixels[0] = 3; + dst_pixels[1] = 3; + dst_pixels[2] = 3; + + libyuv::ScalePlane(orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1, + /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1, + /* dst_width= */ 1, /* dst_height= */ 2, + libyuv::kFilterBox); + + EXPECT_EQ(dst_pixels[0], 1); + EXPECT_EQ(dst_pixels[1], 1); + EXPECT_EQ(dst_pixels[2], 3); + + free_aligned_buffer_page_end(dst_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) { + align_buffer_page_end(orig_pixels_alloc, 3 * 2); + align_buffer_page_end(dst_pixels_alloc, 3 * 2); + uint16_t* orig_pixels = (uint16_t*)orig_pixels_alloc; + uint16_t* dst_pixels = (uint16_t*)dst_pixels_alloc; + + // Pad the 1x1 byte image with invalid values before and after in case libyuv + // reads outside the memory boundaries. + orig_pixels[0] = 0; + orig_pixels[1] = 1; // scale this pixel + orig_pixels[2] = 2; + dst_pixels[0] = 3; + dst_pixels[1] = 3; + dst_pixels[2] = 3; + + libyuv::ScalePlane_16( + orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1, + /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1, + /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone); + + EXPECT_EQ(dst_pixels[0], 1); + EXPECT_EQ(dst_pixels[1], 1); + EXPECT_EQ(dst_pixels[2], 3); + + free_aligned_buffer_page_end(dst_pixels_alloc); + free_aligned_buffer_page_end(orig_pixels_alloc); +} +} // namespace libyuv diff --git a/files/unit_test/scale_rgb_test.cc b/unit_test/scale_rgb_test.cc index 8296abe3..8296abe3 100644 --- a/files/unit_test/scale_rgb_test.cc +++ b/unit_test/scale_rgb_test.cc diff --git a/files/unit_test/scale_test.cc b/unit_test/scale_test.cc index a8c95268..6e3b9271 100644 --- a/files/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -22,6 +22,11 @@ #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) +#if defined(__riscv) && 
!defined(__clang__) +#define DISABLE_SLOW_TESTS +#undef ENABLE_FULL_TESTS +#endif + #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) // SLOW TESTS are those that are unoptimized C code. // FULL TESTS are optimized but test many variations of the same code. @@ -1123,479 +1128,6 @@ TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3) TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3) #endif #endif - #undef TEST_SCALESWAPXY1 -#ifdef ENABLE_ROW_TESTS -#ifdef HAS_SCALEROWDOWN2_SSSE3 -TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { - SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); - SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); - SIMD_ALIGNED(uint8_t dst_pixels_c[64]); - memset(orig_pixels, 0, sizeof(orig_pixels)); - memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (!has_ssse3) { - printf("Warning SSSE3 not detected; Skipping test.\n"); - } else { - // TL. - orig_pixels[0] = 255u; - orig_pixels[1] = 0u; - orig_pixels[128 + 0] = 0u; - orig_pixels[128 + 1] = 0u; - // TR. - orig_pixels[2] = 0u; - orig_pixels[3] = 100u; - orig_pixels[128 + 2] = 0u; - orig_pixels[128 + 3] = 0u; - // BL. - orig_pixels[4] = 0u; - orig_pixels[5] = 0u; - orig_pixels[128 + 4] = 50u; - orig_pixels[128 + 5] = 0u; - // BR. - orig_pixels[6] = 0u; - orig_pixels[7] = 0u; - orig_pixels[128 + 6] = 0u; - orig_pixels[128 + 7] = 20u; - // Odd. - orig_pixels[126] = 4u; - orig_pixels[127] = 255u; - orig_pixels[128 + 126] = 16u; - orig_pixels[128 + 127] = 255u; - - // Test regular half size. - ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(133u, dst_pixels_c[63]); - - // Test Odd width version - Last pixel is just 1 horizontal pixel. 
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(10u, dst_pixels_c[63]); - - // Test one pixel less, should skip the last pixel. - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(0u, dst_pixels_c[63]); - - // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - - EXPECT_EQ(64u, dst_pixels_opt[0]); - EXPECT_EQ(25u, dst_pixels_opt[1]); - EXPECT_EQ(13u, dst_pixels_opt[2]); - EXPECT_EQ(5u, dst_pixels_opt[3]); - EXPECT_EQ(0u, dst_pixels_opt[4]); - EXPECT_EQ(133u, dst_pixels_opt[63]); - - // Compare C and SSSE3 match. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - for (int i = 0; i < 64; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); - } - } -} -#endif // HAS_SCALEROWDOWN2_SSSE3 - -extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); - -TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) { - SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun. 
- SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); - SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); - - memset(orig_pixels, 0, sizeof(orig_pixels)); - memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt)); - memset(dst_pixels_c, 2, sizeof(dst_pixels_c)); - - for (int i = 0; i < 640 * 2 + 1; ++i) { - orig_pixels[i] = i; - } - ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280); - for (int i = 0; i < benchmark_pixels_div1280_; ++i) { -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - int has_neon = TestCpuFlag(kCpuHasNEON); - if (has_neon) { - ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); - } else { - ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); - } -#else - ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); -#endif - } - - for (int i = 0; i < 1280; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); - } - EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16); - EXPECT_EQ(dst_pixels_c[1279], 800); -} - -extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); - -TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { - SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]); - SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); - SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); - - memset(orig_pixels, 0, sizeof(orig_pixels)); - memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); - memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); - - for (int i = 0; i < 2560 * 2; ++i) { - orig_pixels[i] = i; - } - ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280); - for (int i = 0; i < benchmark_pixels_div1280_; ++i) { -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - int has_neon = TestCpuFlag(kCpuHasNEON); - if (has_neon) { - ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); - } else { - ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); - } -#else - 
ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); -#endif - } - - for (int i = 0; i < 1280; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); - } - - EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4); - EXPECT_EQ(dst_pixels_c[1279], 3839); -} -#endif // ENABLE_ROW_TESTS - -// Test scaling plane with 8 bit C vs 12 bit C and return maximum pixel -// difference. -// 0 = exact. -static int TestPlaneFilter_16(int src_width, - int src_height, - int dst_width, - int dst_height, - FilterMode f, - int benchmark_iterations, - int disable_cpu_flags, - int benchmark_cpu_info) { - if (!SizeValid(src_width, src_height, dst_width, dst_height)) { - return 0; - } - - int i; - int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); - int src_stride_y = Abs(src_width); - int dst_y_plane_size = dst_width * dst_height; - int dst_stride_y = dst_width; - - align_buffer_page_end(src_y, src_y_plane_size); - align_buffer_page_end(src_y_16, src_y_plane_size * 2); - align_buffer_page_end(dst_y_8, dst_y_plane_size); - align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); - uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); - uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); - - MemRandomize(src_y, src_y_plane_size); - memset(dst_y_8, 0, dst_y_plane_size); - memset(dst_y_16, 1, dst_y_plane_size * 2); - - for (i = 0; i < src_y_plane_size; ++i) { - p_src_y_16[i] = src_y[i] & 255; - } - - MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y, - dst_width, dst_height, f); - MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. - - for (i = 0; i < benchmark_iterations; ++i) { - ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16, - dst_stride_y, dst_width, dst_height, f); - } - - // Expect an exact match. 
- int max_diff = 0; - for (i = 0; i < dst_y_plane_size; ++i) { - int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } - } - - free_aligned_buffer_page_end(dst_y_8); - free_aligned_buffer_page_end(dst_y_16); - free_aligned_buffer_page_end(src_y); - free_aligned_buffer_page_end(src_y_16); - - return max_diff; -} - -// The following adjustments in dimensions ensure the scale factor will be -// exactly achieved. -// 2 is chroma subsample. -#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) -#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) - -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \ - int diff = TestPlaneFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } - -// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but -// filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \ - TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \ - TEST_FACTOR1(name, Box, nom, denom, boxdiff) - -TEST_FACTOR(2, 1, 2, 0) -TEST_FACTOR(4, 1, 4, 0) -// TEST_FACTOR(8, 1, 8, 0) Disable for benchmark performance. Takes 90 seconds. 
-TEST_FACTOR(3by4, 3, 4, 1) -TEST_FACTOR(3by8, 3, 8, 1) -TEST_FACTOR(3, 1, 3, 0) -#undef TEST_FACTOR1 -#undef TEST_FACTOR -#undef SX -#undef DX - -TEST_F(LibYUVScaleTest, PlaneTest3x) { - const int kSrcStride = 480; - const int kDstStride = 160; - const int kSize = kSrcStride * 3; - align_buffer_page_end(orig_pixels, kSize); - for (int i = 0; i < 480 * 3; ++i) { - orig_pixels[i] = i; - } - align_buffer_page_end(dest_pixels, kDstStride); - - int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * - benchmark_iterations_; - for (int i = 0; i < iterations160; ++i) { - ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, - kFilterBilinear); - } - - EXPECT_EQ(225, dest_pixels[0]); - - ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, - kFilterNone); - - EXPECT_EQ(225, dest_pixels[0]); - - free_aligned_buffer_page_end(dest_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVScaleTest, PlaneTest4x) { - const int kSrcStride = 640; - const int kDstStride = 160; - const int kSize = kSrcStride * 4; - align_buffer_page_end(orig_pixels, kSize); - for (int i = 0; i < 640 * 4; ++i) { - orig_pixels[i] = i; - } - align_buffer_page_end(dest_pixels, kDstStride); - - int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * - benchmark_iterations_; - for (int i = 0; i < iterations160; ++i) { - ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, - kFilterBilinear); - } - - EXPECT_EQ(66, dest_pixels[0]); - - ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, - kFilterNone); - - EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row - - free_aligned_buffer_page_end(dest_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -// Intent is to test 200x50 to 50x200 but width and height can be parameters. 
-TEST_F(LibYUVScaleTest, PlaneTestRotate_None) { - const int kSize = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_pixels, kSize); - for (int i = 0; i < kSize; ++i) { - orig_pixels[i] = i; - } - align_buffer_page_end(dest_opt_pixels, kSize); - align_buffer_page_end(dest_c_pixels, kSize); - - MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, - dest_c_pixels, benchmark_height_, benchmark_height_, - benchmark_width_, kFilterNone); - MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. - - for (int i = 0; i < benchmark_iterations_; ++i) { - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, - benchmark_height_, dest_opt_pixels, benchmark_height_, - benchmark_height_, benchmark_width_, kFilterNone); - } - - for (int i = 0; i < kSize; ++i) { - EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); - } - - free_aligned_buffer_page_end(dest_c_pixels); - free_aligned_buffer_page_end(dest_opt_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) { - const int kSize = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_pixels, kSize); - for (int i = 0; i < kSize; ++i) { - orig_pixels[i] = i; - } - align_buffer_page_end(dest_opt_pixels, kSize); - align_buffer_page_end(dest_c_pixels, kSize); - - MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, - dest_c_pixels, benchmark_height_, benchmark_height_, - benchmark_width_, kFilterBilinear); - MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. 
- - for (int i = 0; i < benchmark_iterations_; ++i) { - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, - benchmark_height_, dest_opt_pixels, benchmark_height_, - benchmark_height_, benchmark_width_, kFilterBilinear); - } - - for (int i = 0; i < kSize; ++i) { - EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); - } - - free_aligned_buffer_page_end(dest_c_pixels); - free_aligned_buffer_page_end(dest_opt_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -// Intent is to test 200x50 to 50x200 but width and height can be parameters. -TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) { - const int kSize = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_pixels, kSize); - for (int i = 0; i < kSize; ++i) { - orig_pixels[i] = i; - } - align_buffer_page_end(dest_opt_pixels, kSize); - align_buffer_page_end(dest_c_pixels, kSize); - - MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, - dest_c_pixels, benchmark_height_, benchmark_height_, - benchmark_width_, kFilterBox); - MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. - - for (int i = 0; i < benchmark_iterations_; ++i) { - ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, - benchmark_height_, dest_opt_pixels, benchmark_height_, - benchmark_height_, benchmark_width_, kFilterBox); - } - - for (int i = 0; i < kSize; ++i) { - EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); - } - - free_aligned_buffer_page_end(dest_c_pixels); - free_aligned_buffer_page_end(dest_opt_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVScaleTest, PlaneTest1_Box) { - align_buffer_page_end(orig_pixels, 3); - align_buffer_page_end(dst_pixels, 3); - - // Pad the 1x1 byte image with invalid values before and after in case libyuv - // reads outside the memory boundaries. 
- orig_pixels[0] = 0; - orig_pixels[1] = 1; // scale this pixel - orig_pixels[2] = 2; - dst_pixels[0] = 3; - dst_pixels[1] = 3; - dst_pixels[2] = 3; - - libyuv::ScalePlane(orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1, - /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1, - /* dst_width= */ 1, /* dst_height= */ 2, - libyuv::kFilterBox); - - EXPECT_EQ(dst_pixels[0], 1); - EXPECT_EQ(dst_pixels[1], 1); - EXPECT_EQ(dst_pixels[2], 3); - - free_aligned_buffer_page_end(dst_pixels); - free_aligned_buffer_page_end(orig_pixels); -} - -TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) { - align_buffer_page_end(orig_pixels_alloc, 3 * 2); - align_buffer_page_end(dst_pixels_alloc, 3 * 2); - uint16_t* orig_pixels = (uint16_t*)orig_pixels_alloc; - uint16_t* dst_pixels = (uint16_t*)dst_pixels_alloc; - - // Pad the 1x1 byte image with invalid values before and after in case libyuv - // reads outside the memory boundaries. - orig_pixels[0] = 0; - orig_pixels[1] = 1; // scale this pixel - orig_pixels[2] = 2; - dst_pixels[0] = 3; - dst_pixels[1] = 3; - dst_pixels[2] = 3; - - libyuv::ScalePlane_16( - orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1, - /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1, - /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone); - - EXPECT_EQ(dst_pixels[0], 1); - EXPECT_EQ(dst_pixels[1], 1); - EXPECT_EQ(dst_pixels[2], 3); - - free_aligned_buffer_page_end(dst_pixels_alloc); - free_aligned_buffer_page_end(orig_pixels_alloc); -} } // namespace libyuv diff --git a/files/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc index 3d524bef..dab217c9 100644 --- a/files/unit_test/scale_uv_test.cc +++ b/unit_test/scale_uv_test.cc @@ -39,55 +39,35 @@ static int UVTestFilter(int src_width, return 0; } - int i, j; - const int b = 0; // 128 to test for padding/stride. 
- int64_t src_uv_plane_size = - (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; - int src_stride_uv = (b * 2 + Abs(src_width)) * 2; + int i; + int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL; + int src_stride_uv = Abs(src_width) * 2; + int64_t dst_uv_plane_size = dst_width * dst_height * 2LL; + int dst_stride_uv = dst_width * 2; align_buffer_page_end(src_uv, src_uv_plane_size); - if (!src_uv) { - printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); - return 0; - } - MemRandomize(src_uv, src_uv_plane_size); - - int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; - int dst_stride_uv = (b * 2 + dst_width) * 2; - align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); - if (!dst_uv_c || !dst_uv_opt) { + + if (!src_uv || !dst_uv_c || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } + MemRandomize(src_uv, src_uv_plane_size); memset(dst_uv_c, 2, dst_uv_plane_size); - memset(dst_uv_opt, 3, dst_uv_plane_size); - - // Warm up both versions for consistent benchmarks. - MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); - MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + memset(dst_uv_opt, 123, dst_uv_plane_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
double c_time = get_time(); - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_c, dst_stride_uv, dst_width, dst_height, f); - c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_opt, + dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; @@ -95,18 +75,11 @@ static int UVTestFilter(int src_width, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference - // of the buffers and look to see that the max difference isn't - // over 2. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b * 2; j < (dst_width + b) * 2; ++j) { - int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - - dst_uv_opt[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } @@ -121,28 +94,26 @@ static int UVTestFilter(int src_width, #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast<int>((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(name, filter, nom, denom) \ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ int diff = UVTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ + EXPECT_EQ(0, diff); \ } #if defined(ENABLE_FULL_TESTS) -// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but -// filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +// Test a scale factor with all 4 filters. Expect exact for SIMD vs C. +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(name, None, nom, denom) \ + TEST_FACTOR1(name, Linear, nom, denom) \ + TEST_FACTOR1(name, Bilinear, nom, denom) \ + TEST_FACTOR1(name, Box, nom, denom) #else // Test a scale factor with Bilinear. 
-#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) +#define TEST_FACTOR(name, nom, denom) TEST_FACTOR1(name, Bilinear, nom, denom) #endif TEST_FACTOR(2, 1, 2) diff --git a/files/unit_test/testdata/arm_v7.txt b/unit_test/testdata/arm_v7.txt index 5d7dbd04..5d7dbd04 100644 --- a/files/unit_test/testdata/arm_v7.txt +++ b/unit_test/testdata/arm_v7.txt diff --git a/files/unit_test/testdata/juno.txt b/unit_test/testdata/juno.txt index dd465272..dd465272 100644 --- a/files/unit_test/testdata/juno.txt +++ b/unit_test/testdata/juno.txt diff --git a/files/unit_test/testdata/mips.txt b/unit_test/testdata/mips.txt index d9f28cbf..d9f28cbf 100644 --- a/files/unit_test/testdata/mips.txt +++ b/unit_test/testdata/mips.txt diff --git a/files/unit_test/testdata/mips_loongson2k.txt b/unit_test/testdata/mips_loongson2k.txt index 8a88d38f..8a88d38f 100644 --- a/files/unit_test/testdata/mips_loongson2k.txt +++ b/unit_test/testdata/mips_loongson2k.txt diff --git a/files/unit_test/testdata/mips_loongson3.txt b/unit_test/testdata/mips_loongson3.txt index 1f540b12..1f540b12 100644 --- a/files/unit_test/testdata/mips_loongson3.txt +++ b/unit_test/testdata/mips_loongson3.txt diff --git a/files/unit_test/testdata/mips_loongson_mmi.txt b/unit_test/testdata/mips_loongson_mmi.txt index 0f10b8bb..0f10b8bb 100644 --- a/files/unit_test/testdata/mips_loongson_mmi.txt +++ b/unit_test/testdata/mips_loongson_mmi.txt diff --git a/files/unit_test/testdata/mips_msa.txt b/unit_test/testdata/mips_msa.txt index ac930615..ac930615 100644 --- a/files/unit_test/testdata/mips_msa.txt +++ b/unit_test/testdata/mips_msa.txt diff --git a/unit_test/testdata/riscv64.txt b/unit_test/testdata/riscv64.txt new file mode 100644 index 00000000..fbb4200f --- /dev/null +++ b/unit_test/testdata/riscv64.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imac +mmu : sv48
\ No newline at end of file diff --git a/unit_test/testdata/riscv64_rvv.txt b/unit_test/testdata/riscv64_rvv.txt new file mode 100644 index 00000000..af1b3f36 --- /dev/null +++ b/unit_test/testdata/riscv64_rvv.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv +mmu : sv48
\ No newline at end of file diff --git a/unit_test/testdata/riscv64_rvv_zvfh.txt b/unit_test/testdata/riscv64_rvv_zvfh.txt new file mode 100644 index 00000000..c416c1af --- /dev/null +++ b/unit_test/testdata/riscv64_rvv_zvfh.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv_zfh_zvfh +mmu : sv48
\ No newline at end of file diff --git a/files/unit_test/testdata/tegra3.txt b/unit_test/testdata/tegra3.txt index d1b09f6b..d1b09f6b 100644 --- a/files/unit_test/testdata/tegra3.txt +++ b/unit_test/testdata/tegra3.txt diff --git a/files/unit_test/testdata/test0.jpg b/unit_test/testdata/test0.jpg Binary files differindex f4461a81..f4461a81 100644 --- a/files/unit_test/testdata/test0.jpg +++ b/unit_test/testdata/test0.jpg diff --git a/files/unit_test/testdata/test1.jpg b/unit_test/testdata/test1.jpg Binary files differindex a0210e9d..a0210e9d 100644 --- a/files/unit_test/testdata/test1.jpg +++ b/unit_test/testdata/test1.jpg diff --git a/files/unit_test/testdata/test2.jpg b/unit_test/testdata/test2.jpg Binary files differindex 816ca767..816ca767 100644 --- a/files/unit_test/testdata/test2.jpg +++ b/unit_test/testdata/test2.jpg diff --git a/files/unit_test/testdata/test3.jpg b/unit_test/testdata/test3.jpg Binary files differindex 792d91dc..792d91dc 100644 --- a/files/unit_test/testdata/test3.jpg +++ b/unit_test/testdata/test3.jpg diff --git a/files/unit_test/testdata/test4.jpg b/unit_test/testdata/test4.jpg Binary files differindex 1ef41668..1ef41668 100644 --- a/files/unit_test/testdata/test4.jpg +++ b/unit_test/testdata/test4.jpg diff --git a/files/unit_test/unit_test.cc b/unit_test/unit_test.cc index 61145a46..239d5b92 100644 --- a/files/unit_test/unit_test.cc +++ b/unit_test/unit_test.cc @@ -88,6 +88,11 @@ int TestCpuEnv(int cpu_info) { cpu_info &= ~libyuv::kCpuHasLASX; } #endif +#if defined(__riscv) && defined(__linux__) + if (TestEnv("LIBYUV_DISABLE_RVV")) { + cpu_info &= ~libyuv::kCpuHasRVV; + } +#endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86)) @@ -139,11 +144,14 @@ int TestCpuEnv(int cpu_info) { if (TestEnv("LIBYUV_DISABLE_AVX512VBITALG")) { cpu_info &= ~libyuv::kCpuHasAVX512VBITALG; } - if (TestEnv("LIBYUV_DISABLE_AVX512VPOPCNTDQ")) { - cpu_info &= 
~libyuv::kCpuHasAVX512VPOPCNTDQ; + if (TestEnv("LIBYUV_DISABLE_AVX10")) { + cpu_info &= ~libyuv::kCpuHasAVX10; + } + if (TestEnv("LIBYUV_DISABLE_AVXVNNI")) { + cpu_info &= ~libyuv::kCpuHasAVXVNNI; } - if (TestEnv("LIBYUV_DISABLE_GFNI")) { - cpu_info &= ~libyuv::kCpuHasGFNI; + if (TestEnv("LIBYUV_DISABLE_AVXVNNIINT8")) { + cpu_info &= ~libyuv::kCpuHasAVXVNNIINT8; } #endif if (TestEnv("LIBYUV_DISABLE_ASM")) { diff --git a/files/unit_test/unit_test.h b/unit_test/unit_test.h index 0a8df4d2..99cc8d19 100644 --- a/files/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -11,10 +11,10 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ +#include <stddef.h> // For NULL #ifdef _WIN32 #include <windows.h> #else -#include <sys/resource.h> #include <sys/time.h> #endif @@ -77,7 +77,18 @@ static inline bool SizeValid(int src_width, #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ - var = 0 + var = NULL + +#define align_buffer_page_end_16(var, size) \ + uint8_t* var##_mem = \ + reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095)); \ + uint16_t* var = reinterpret_cast<uint16_t*>( \ + (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \ + ~63) + +#define free_aligned_buffer_page_end_16(var) \ + free(var##_mem); \ + var = NULL #ifdef WIN32 static inline double get_time() { diff --git a/files/unit_test/video_common_test.cc b/unit_test/video_common_test.cc index 36728ea9..36728ea9 100644 --- a/files/unit_test/video_common_test.cc +++ b/unit_test/video_common_test.cc diff --git a/files/util/Makefile b/util/Makefile index 40e74b65..40e74b65 100644 --- a/files/util/Makefile +++ b/util/Makefile diff --git a/files/util/color.cc b/util/color.cc index 8c3bbefd..8c3bbefd 100644 --- a/files/util/color.cc +++ b/util/color.cc diff --git a/files/util/compare.cc b/util/compare.cc index a16613ee..a16613ee 100644 --- a/files/util/compare.cc +++ b/util/compare.cc diff --git a/files/util/cpuid.c b/util/cpuid.c index 
b618bb10..c07e6e95 100644 --- a/files/util/cpuid.c +++ b/util/cpuid.c @@ -21,8 +21,9 @@ using namespace libyuv; int main(int argc, const char* argv[]) { int cpu_flags = TestCpuFlag(-1); int has_arm = TestCpuFlag(kCpuHasARM); - int has_mips = TestCpuFlag(kCpuHasMIPS); + int has_riscv = TestCpuFlag(kCpuHasRISCV); int has_x86 = TestCpuFlag(kCpuHasX86); + int has_mips = TestCpuFlag(kCpuHasMIPS); int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); (void)argc; (void)argv; @@ -62,24 +63,28 @@ int main(int argc, const char* argv[]) { model, model); } #endif - printf("Cpu Flags %x\n", cpu_flags); - printf("Has ARM %x\n", has_arm); - printf("Has MIPS %x\n", has_mips); - printf("Has X86 %x\n", has_x86); - printf("Has LOONGARCH %x\n", has_loongarch); + printf("Cpu Flags 0x%x\n", cpu_flags); if (has_arm) { int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %x\n", has_neon); + printf("Has ARM 0x%x\n", has_arm); + printf("Has NEON 0x%x\n", has_neon); + } + if (has_riscv) { + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RISCV 0x%x\n", has_riscv); + printf("Has RVV 0x%x\n", has_rvv); } if (has_mips) { int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %x\n", has_msa); + printf("Has MIPS 0x%x\n", has_mips); + printf("Has MSA 0x%x\n", has_msa); } if (has_loongarch) { int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - printf("Has LASX %x\n", has_lasx); + printf("Has LOONGARCH 0x%x\n", has_loongarch); + printf("Has LSX 0x%x\n", has_lsx); + printf("Has LASX 0x%x\n", has_lasx); } if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); @@ -91,31 +96,34 @@ int main(int argc, const char* argv[]) { int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); - int has_gfni = TestCpuFlag(kCpuHasGFNI); int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); int has_avx512vnni = 
TestCpuFlag(kCpuHasAVX512VNNI); int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); - int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has SSE2 %x\n", has_sse2); - printf("Has SSSE3 %x\n", has_ssse3); - printf("Has SSE4.1 %x\n", has_sse41); - printf("Has SSE4.2 %x\n", has_sse42); - printf("Has AVX %x\n", has_avx); - printf("Has AVX2 %x\n", has_avx2); - printf("Has ERMS %x\n", has_erms); - printf("Has FMA3 %x\n", has_fma3); - printf("Has F16C %x\n", has_f16c); - printf("Has GFNI %x\n", has_gfni); - printf("Has AVX512BW %x\n", has_avx512bw); - printf("Has AVX512VL %x\n", has_avx512vl); - printf("Has AVX512VNNI %x\n", has_avx512vnni); - printf("Has AVX512VBMI %x\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); + int has_avx10 = TestCpuFlag(kCpuHasAVX10); + int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI); + int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8); + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE4.1 0x%x\n", has_sse41); + printf("Has SSE4.2 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX10 0x%x\n", has_avx10); + printf("HAS AVXVNNI 0x%x\n", has_avxvnni); + printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); } return 0; } 
diff --git a/files/util/i444tonv12_eg.cc b/util/i444tonv12_eg.cc index 0fcb4095..0fcb4095 100644 --- a/files/util/i444tonv12_eg.cc +++ b/util/i444tonv12_eg.cc diff --git a/files/util/psnr.cc b/util/psnr.cc index c7bee7f9..c7bee7f9 100644 --- a/files/util/psnr.cc +++ b/util/psnr.cc diff --git a/files/util/psnr.h b/util/psnr.h index aac128cb..aac128cb 100644 --- a/files/util/psnr.h +++ b/util/psnr.h diff --git a/files/util/psnr_main.cc b/util/psnr_main.cc index 8b9fd972..8b9fd972 100644 --- a/files/util/psnr_main.cc +++ b/util/psnr_main.cc diff --git a/files/util/ssim.cc b/util/ssim.cc index 096fbcf0..096fbcf0 100644 --- a/files/util/ssim.cc +++ b/util/ssim.cc diff --git a/files/util/ssim.h b/util/ssim.h index a855f1d1..a855f1d1 100644 --- a/files/util/ssim.h +++ b/util/ssim.h diff --git a/files/util/yuvconstants.c b/util/yuvconstants.c index 037e0824..4e5185af 100644 --- a/files/util/yuvconstants.c +++ b/util/yuvconstants.c @@ -43,9 +43,10 @@ // #define BR (-VR * 128 + YB) int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("yuvconstants Kr Kb\n"); - printf(" MC BT KR = 0.2126; KB = 0.0722\n"); + if (argc < 3) { + printf("yuvconstants [KR] [KB]\n"); + printf(" e.g. yuvconstants 0.2126 0.0722\n"); + printf(" MC BT KR KB\n"); printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n"); printf(" 4 FCC KR = 0.30; KB = 0.11\n"); printf(" 6 BT.601 KR = 0.299; KB = 0.114\n"); @@ -53,8 +54,8 @@ int main(int argc, const char* argv[]) { printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n"); return -1; } - float kr = atof(argv[1]); - float kb = atof(argv[2]); + float kr = (float)atof(argv[1]); + float kb = (float)atof(argv[2]); float kg = 1 - kr - kb; float vr = 2 * (1 - kr); diff --git a/files/util/yuvconvert.cc b/util/yuvconvert.cc index 332699e3..93b52668 100644 --- a/files/util/yuvconvert.cc +++ b/util/yuvconvert.cc @@ -42,9 +42,9 @@ static __inline uint32_t Abs(int32_t v) { } // Parse PYUV format. 
ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { +static bool ExtractResolutionFromFilename(const char* name, + int* width_ptr, + int* height_ptr) { // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { @@ -59,7 +59,7 @@ bool ExtractResolutionFromFilename(const char* name, return false; } -void PrintHelp(const char* program) { +static void PrintHelp(const char* program) { printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); printf( " -s <width> <height> .... specify source resolution. " @@ -78,7 +78,7 @@ void PrintHelp(const char* program) { exit(0); } -void ParseOptions(int argc, const char* argv[]) { +static void ParseOptions(int argc, const char* argv[]) { if (argc <= 1) { PrintHelp(argv[0]); } diff --git a/files/winarm.mk b/winarm.mk index b0a344ae..b0a344ae 100644 --- a/files/winarm.mk +++ b/winarm.mk |