Update libyuv to r1602 version to get best performance.android-cts_7.1_r1 android-cts-7.1_r9 android-cts-7.1_r8 android-cts-7.1_r7 android-cts-7.1_r6 android-cts-7.1_r5 android-cts-7.1_r4 android-cts-7.1_r3 android-cts-7.1_r29 android-cts-7.1_r28 android-cts-7.1_r27 android-cts-7.1_r26 android-cts-7.1_r25 android-cts-7.1_r24 android-cts-7.1_r23 android-cts-7.1_r22 android-cts-7.1_r21 android-cts-7.1_r20 android-cts-7.1_r2 android-cts-7.1_r19 android-cts-7.1_r18 android-cts-7.1_r17 android-cts-7.1_r16 android-cts-7.1_r15 android-cts-7.1_r14 android-cts-7.1_r13 android-cts-7.1_r12 android-cts-7.1_r11 android-cts-7.1_r10 android-cts-7.1_r1 android-7.1.2_r9 android-7.1.2_r8 android-7.1.2_r6 android-7.1.2_r5 android-7.1.2_r4 android-7.1.2_r39 android-7.1.2_r38 android-7.1.2_r37 android-7.1.2_r36 android-7.1.2_r33 android-7.1.2_r32 android-7.1.2_r30 android-7.1.2_r3 android-7.1.2_r29 android-7.1.2_r28 android-7.1.2_r27 android-7.1.2_r25 android-7.1.2_r24 android-7.1.2_r23 android-7.1.2_r2 android-7.1.2_r19 android-7.1.2_r18 android-7.1.2_r17 android-7.1.2_r16 android-7.1.2_r15 android-7.1.2_r14 android-7.1.2_r13 android-7.1.2_r12 android-7.1.2_r11 android-7.1.2_r10 android-7.1.2_r1 android-7.1.1_r9 android-7.1.1_r8 android-7.1.1_r7 android-7.1.1_r61 android-7.1.1_r60 android-7.1.1_r6 android-7.1.1_r59 android-7.1.1_r58 android-7.1.1_r57 android-7.1.1_r56 android-7.1.1_r55 android-7.1.1_r54 android-7.1.1_r53 android-7.1.1_r52 android-7.1.1_r51 android-7.1.1_r50 android-7.1.1_r49 android-7.1.1_r48 android-7.1.1_r47 android-7.1.1_r46 android-7.1.1_r45 android-7.1.1_r44 android-7.1.1_r43 android-7.1.1_r42 android-7.1.1_r41 android-7.1.1_r40 android-7.1.1_r4 android-7.1.1_r39 android-7.1.1_r38 android-7.1.1_r35 android-7.1.1_r33 android-7.1.1_r32 android-7.1.1_r31 android-7.1.1_r3 android-7.1.1_r28 android-7.1.1_r27 android-7.1.1_r26 android-7.1.1_r25 android-7.1.1_r24 android-7.1.1_r23 android-7.1.1_r22 android-7.1.1_r21 android-7.1.1_r20 android-7.1.1_r2 android-7.1.1_r17 android-7.1.1_r16 android-7.1.1_r15 android-7.1.1_r14 android-7.1.1_r13 android-7.1.1_r12 android-7.1.1_r11 android-7.1.1_r10 android-7.1.1_r1 android-7.1.0_r7 android-7.1.0_r6 android-7.1.0_r5 android-7.1.0_r4 android-7.1.0_r3 android-7.1.0_r2 android-7.1.0_r1 nougat-mr2.3-release nougat-mr2.2-release nougat-mr2.1-release nougat-mr2-security-release nougat-mr2-release nougat-mr2-pixel-release nougat-mr2-dev nougat-mr1.8-release nougat-mr1.7-release nougat-mr1.6-release nougat-mr1.5-release nougat-mr1.4-release nougat-mr1.3-release nougat-mr1.2-release nougat-mr1.1-release nougat-mr1-volantis-release nougat-mr1-security-release nougat-mr1-release nougat-mr1-flounder-release nougat-mr1-dev nougat-mr1-cts-release nougat-dr1-release

Bug: 29870647 Change-Id: I8ec9fab7f55765fa33ebe7ba1c7ad2147f418de2
author: Hangyu Kuang <hkuang@google.com> 2016-07-06 14:21:45 -0700
committer: Hangyu Kuang <hkuang@google.com> 2016-07-08 09:51:10 -0700
commit: f047e7ca6983218eed7703c7afd51fed7bd3b5c9 (patch)
tree: 2667579566b6270c21ee4b495b4cd119af5ccf5b
parent: bb74e3e19b98261031216de8cadcef34cccd9e4a (diff)
download: libyuv-nougat-mr2-pixel-release.tar.gz
149 files changed, 53862 insertions, 20721 deletions
diff --git a/Android.mk b/Android.mk
index 212c4fc6..42e9e427 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,56 +1,5 @@
-# This is the Android makefile for google3/third_party/libsrtp so that we can
-# build it with the Android NDK.
-
 LOCAL_PATH := $(call my-dir)
-
-common_SRC_FILES := \
-    files/source/compare.cc \
-    files/source/convert.cc \
-    files/source/convert_argb.cc \
-    files/source/convert_from.cc \
-    files/source/cpu_id.cc \
-    files/source/format_conversion.cc \
-    files/source/mjpeg_decoder.cc \
-    files/source/planar_functions.cc \
-    files/source/rotate.cc \
-    files/source/rotate_argb.cc \
-    files/source/row_common.cc \
-    files/source/row_posix.cc \
-    files/source/scale.cc \
-    files/source/scale_argb.cc \
-    files/source/video_common.cc
-
-common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG
-
-common_C_INCLUDES = $(LOCAL_PATH)/files/include
-
-# For the device
-# =====================================================
-# Device static library
-
 include $(CLEAR_VARS)
 
-LOCAL_CPP_EXTENSION := .cc
-
-LOCAL_SDK_VERSION := 9
-LOCAL_NDK_STL_VARIANT := stlport_static
-
-LOCAL_SRC_FILES := $(common_SRC_FILES)
-LOCAL_CFLAGS += $(common_CFLAGS)
-LOCAL_C_INCLUDES += $(common_C_INCLUDES)
-LOCAL_SHARED_LIBRARIES := libjpeg
-
-LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/files/include
-
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
-    LOCAL_CFLAGS_arm += -DLIBYUV_NEON
-    LOCAL_SRC_FILES_arm += \
-        files/source/compare_neon.cc \
-        files/source/rotate_neon.cc \
-        files/source/row_neon.cc \
-        files/source/scale_neon.cc
-endif
-
-LOCAL_MODULE := libyuv_static
-
-include $(BUILD_STATIC_LIBRARY)
+# libyuv
+include external/libyuv/files/Android.mk
diff --git a/README.google b/README.google
index c887302d..285364ed 100644
--- a/README.google
+++ b/README.google
@@ -1,5 +1,5 @@
-URL: http://libyuv.googlecode.com/svn-history/r397/trunk/
-Version: r397
+URL: https://chromium.googlesource.com/libyuv/libyuv/
+Version: r1602
 License: BSD
 License File: LICENSE
 
diff --git a/files/.gitignore b/files/.gitignore
new file mode 100644
index 00000000..711f09e0
--- /dev/null
+++ b/files/.gitignore
@@ -0,0 +1,94 @@
+*.pyc
+pin-log.txt
+/base
+/build
+/buildtools
+/chromium/.gclient.tmp
+/chromium/.gclient.tmp_entries
+/chromium/.last_sync_chromium
+/chromium/src/
+/google_apis
+/links
+/links.db
+/mojo
+/native_client
+/net
+/out
+/sde-avx-sse-transition-out.txt
+/testing
+/third_party/android_platform
+/third_party/android_tools
+/third_party/appurify-python
+/third_party/asan
+/third_party/ashmem
+/third_party/binutils
+/third_party/BUILD.gn
+/third_party/catapult
+/third_party/drmemory
+/third_party/gflags/src
+/third_party/icu
+/third_party/ijar
+/third_party/instrumented_libraries
+/third_party/jsr-305
+/third_party/junit
+/third_party/libjpeg
+/third_party/libjpeg_turbo
+/third_party/libxml
+/third_party/llvm
+/third_party/llvm-build
+/third_party/lss
+/third_party/mockito
+/third_party/modp_b64
+/third_party/protobuf
+/third_party/requests
+/third_party/robolectric
+/third_party/WebKit
+/third_party/yasm
+/tools/android
+/tools/clang
+/tools/generate_library_loader
+/tools/gn
+/tools/grit
+/tools/gritsettings/README
+/tools/gritsettings/resource_ids
+/tools/gyp
+/tools/isolate_driver.py
+/tools/memory
+/tools/protoc_wrapper
+/tools/python
+/tools/sanitizer_options
+/tools/swarming_client
+/tools/tsan_suppressions
+/tools/valgrind
+/tools/valgrind-libyuv/libyuv_tests.bat
+/tools/valgrind-libyuv/libyuv_tests.py
+/tools/valgrind-libyuv/libyuv_tests.sh
+/tools/valgrind-libyuv/memcheck/OWNERS
+/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
+/tools/valgrind-libyuv/memcheck/suppressions.txt
+/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
+/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
+/tools/valgrind-libyuv/tsan/OWNERS
+/tools/valgrind-libyuv/tsan/PRESUBMIT.py
+/tools/valgrind-libyuv/tsan/suppressions.txt
+/tools/valgrind-libyuv/tsan/suppressions_mac.txt
+/tools/valgrind-libyuv/tsan/suppressions_win32.txt
+/tools/vim
+/tools/win
+
+# Files generated by CMake build
+cmake_install.cmake
+CMakeCache.txt
+CMakeFiles/
+convert
+libgtest.a
+libyuv.a
+libyuv_unittest
+
+# Files generated by winarm.mk build
+libyuv_arm.lib
+source/*.o
+
+# Files generated by perf
+perf.data
+perf.data.old
diff --git a/files/.gn b/files/.gn
new file mode 100644
index 00000000..63dad32d
--- /dev/null
+++ b/files/.gn
@@ -0,0 +1,46 @@
+# Copyright 2015 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# The location of the build configuration file.
+buildconfig = "//build/config/BUILDCONFIG.gn"
+
+# The secondary source root is a parallel directory tree where
+# GN build files are placed when they can not be placed directly
+# in the source tree, e.g. for third party source trees.
+secondary_source = "//build/secondary/"
+
+# These are the targets to check headers for by default. The files in targets
+# matching these patterns (see "gn help label_pattern" for format) will have
+# their includes checked for proper dependencies when you run either
+# "gn check" or "gn gen --check".
+check_targets = [ "//libyuv/*" ]
+
+# These are the list of GN files that run exec_script. This whitelist exists
+# to force additional review for new uses of exec_script, which is strongly
+# discouraged except for gypi_to_gn calls.
+exec_script_whitelist = [
+  "//build/config/BUILD.gn",
+  "//build/config/android/BUILD.gn",
+  "//build/config/android/config.gni",
+  "//build/config/android/internal_rules.gni",
+  "//build/config/android/rules.gni",
+  "//build/config/compiler/BUILD.gn",
+  "//build/config/gcc/gcc_version.gni",
+  "//build/config/ios/ios_sdk.gni",
+  "//build/config/linux/BUILD.gn",
+  "//build/config/linux/pkg_config.gni",
+  "//build/config/mac/mac_sdk.gni",
+  "//build/config/posix/BUILD.gn",
+  "//build/config/sysroot.gni",
+  "//build/config/win/visual_studio_version.gni",
+  "//build/gn_helpers.py",
+  "//build/gypi_to_gn.py",
+  "//build/toolchain/gcc_toolchain.gni",
+  "//build/toolchain/mac/BUILD.gn",
+  "//build/toolchain/win/BUILD.gn",
+]
diff --git a/files/Android.mk b/files/Android.mk
new file mode 100644
index 00000000..a13a1706
--- /dev/null
+++ b/files/Android.mk
@@ -0,0 +1,59 @@
+# This is the Android makefile for libyuv for both platform and NDK.
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cc
+
+LOCAL_SRC_FILES := \
+    source/compare.cc           \
+    source/compare_common.cc    \
+    source/compare_neon64.cc    \
+    source/compare_gcc.cc       \
+    source/convert.cc           \
+    source/convert_argb.cc      \
+    source/convert_from.cc      \
+    source/convert_from_argb.cc \
+    source/convert_to_argb.cc   \
+    source/convert_to_i420.cc   \
+    source/cpu_id.cc            \
+    source/planar_functions.cc  \
+    source/rotate.cc            \
+    source/rotate_any.cc        \
+    source/rotate_argb.cc       \
+    source/rotate_common.cc     \
+    source/rotate_mips.cc       \
+    source/rotate_neon64.cc     \
+    source/rotate_gcc.cc        \
+    source/row_any.cc           \
+    source/row_common.cc        \
+    source/row_mips.cc          \
+    source/row_neon64.cc        \
+    source/row_gcc.cc	          \
+    source/scale.cc             \
+    source/scale_any.cc         \
+    source/scale_argb.cc        \
+    source/scale_common.cc      \
+    source/scale_mips.cc        \
+    source/scale_neon64.cc      \
+    source/scale_gcc.cc         \
+    source/video_common.cc      \
+    source/compare_neon.cc      \
+    source/rotate_neon.cc       \
+    source/row_neon.cc          \
+    source/scale_neon.cc        \
+    source/mjpeg_decoder.cc     \
+    source/convert_jpeg.cc      \
+    source/mjpeg_validate.cc
+
+common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG
+LOCAL_CFLAGS += $(common_CFLAGS)
+LOCAL_SHARED_LIBRARIES := libjpeg
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include
+
+LOCAL_MODULE := libyuv_static
+LOCAL_MODULE_TAGS := optional
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/files/BUILD.gn b/files/BUILD.gn
new file mode 100644
index 00000000..b091cbc2
--- /dev/null
+++ b/files/BUILD.gn
@@ -0,0 +1,135 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("//build/config/arm.gni")
+import("//build/config/sanitizers/sanitizers.gni")
+
+config("libyuv_config") {
+  include_dirs = [
+    ".",
+    "include",
+  ]
+}
+
+use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+
+source_set("libyuv") {
+  sources = [
+    # Headers
+    "include/libyuv.h",
+    "include/libyuv/basic_types.h",
+    "include/libyuv/compare.h",
+    "include/libyuv/convert.h",
+    "include/libyuv/convert_argb.h",
+    "include/libyuv/convert_from.h",
+    "include/libyuv/convert_from_argb.h",
+    "include/libyuv/cpu_id.h",
+    "include/libyuv/mjpeg_decoder.h",
+    "include/libyuv/planar_functions.h",
+    "include/libyuv/rotate.h",
+    "include/libyuv/rotate_argb.h",
+    "include/libyuv/rotate_row.h",
+    "include/libyuv/row.h",
+    "include/libyuv/scale.h",
+    "include/libyuv/scale_argb.h",
+    "include/libyuv/scale_row.h",
+    "include/libyuv/version.h",
+    "include/libyuv/video_common.h",
+
+    # Source Files
+    "source/compare.cc",
+    "source/compare_common.cc",
+    "source/compare_gcc.cc",
+    "source/compare_win.cc",
+    "source/convert.cc",
+    "source/convert_argb.cc",
+    "source/convert_from.cc",
+    "source/convert_from_argb.cc",
+    "source/convert_jpeg.cc",
+    "source/convert_to_argb.cc",
+    "source/convert_to_i420.cc",
+    "source/cpu_id.cc",
+    "source/mjpeg_decoder.cc",
+    "source/mjpeg_validate.cc",
+    "source/planar_functions.cc",
+    "source/rotate.cc",
+    "source/rotate_any.cc",
+    "source/rotate_argb.cc",
+    "source/rotate_common.cc",
+    "source/rotate_mips.cc",
+    "source/rotate_gcc.cc",
+    "source/rotate_win.cc",
+    "source/row_any.cc",
+    "source/row_common.cc",
+    "source/row_mips.cc",
+    "source/row_gcc.cc",
+    "source/row_win.cc",
+    "source/scale.cc",
+    "source/scale_any.cc",
+    "source/scale_argb.cc",
+    "source/scale_common.cc",
+    "source/scale_mips.cc",
+    "source/scale_gcc.cc",
+    "source/scale_win.cc",
+    "source/video_common.cc",
+  ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  public_configs = [ ":libyuv_config" ]
+
+  defines = []
+
+  if (!is_ios) {
+    defines += [ "HAVE_JPEG" ]
+  }
+
+  if (is_msan) {
+    # MemorySanitizer does not support assembly code yet.
+    # http://crbug.com/344505
+    defines += [ "LIBYUV_DISABLE_X86" ]
+  }
+
+  deps = [
+    "//third_party:jpeg",
+  ]
+
+  if (use_neon) {
+    deps += [ ":libyuv_neon" ]
+  }
+
+  if (is_nacl) {
+    # Always enable optimization under NaCl to workaround crbug.com/538243 .
+    configs -= [ "//build/config/compiler:default_optimization" ]
+    configs += [ "//build/config/compiler:optimize_max" ]
+  }
+}
+
+if (use_neon) {
+  static_library("libyuv_neon") {
+    sources = [
+      # ARM Source Files
+      "source/compare_neon.cc",
+      "source/compare_neon64.cc",
+      "source/rotate_neon.cc",
+      "source/rotate_neon64.cc",
+      "source/row_neon.cc",
+      "source/row_neon64.cc",
+      "source/scale_neon.cc",
+      "source/scale_neon64.cc",
+    ]
+
+    public_configs = [ ":libyuv_config" ]
+
+    if (current_cpu != "arm64") {
+      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+      cflags = [ "-mfpu=neon" ]
+    }
+  }
+}
diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt
new file mode 100644
index 00000000..718b47ad
--- /dev/null
+++ b/files/CMakeLists.txt
@@ -0,0 +1,142 @@
+cmake_minimum_required(VERSION 2.8)
+
+# CMakeLists for libyuv
+# Originally created for "roxlu build system" to compile libyuv on windows
+# Run with -DTEST=ON to build unit tests
+option(TEST "Built unit tests" OFF)
+
+set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
+set(ly_src_dir ${ly_base_dir}/source/)
+set(ly_inc_dir ${ly_base_dir}/include)
+set(ly_lib_name "yuv")
+
+set(ly_source_files
+  ${ly_src_dir}/compare.cc
+  ${ly_src_dir}/compare_common.cc
+  ${ly_src_dir}/compare_neon.cc
+  ${ly_src_dir}/compare_neon64.cc
+  ${ly_src_dir}/compare_gcc.cc
+  ${ly_src_dir}/compare_win.cc
+  ${ly_src_dir}/convert.cc
+  ${ly_src_dir}/convert_argb.cc
+  ${ly_src_dir}/convert_from.cc
+  ${ly_src_dir}/convert_from_argb.cc
+  ${ly_src_dir}/convert_jpeg.cc
+  ${ly_src_dir}/convert_to_argb.cc
+  ${ly_src_dir}/convert_to_i420.cc
+  ${ly_src_dir}/cpu_id.cc
+  ${ly_src_dir}/mjpeg_decoder.cc
+  ${ly_src_dir}/mjpeg_validate.cc
+  ${ly_src_dir}/planar_functions.cc
+  ${ly_src_dir}/rotate.cc
+  ${ly_src_dir}/rotate_any.cc
+  ${ly_src_dir}/rotate_argb.cc
+  ${ly_src_dir}/rotate_common.cc
+  ${ly_src_dir}/rotate_mips.cc
+  ${ly_src_dir}/rotate_neon.cc
+  ${ly_src_dir}/rotate_neon64.cc
+  ${ly_src_dir}/rotate_gcc.cc
+  ${ly_src_dir}/rotate_win.cc
+  ${ly_src_dir}/row_any.cc
+  ${ly_src_dir}/row_common.cc
+  ${ly_src_dir}/row_mips.cc
+  ${ly_src_dir}/row_neon.cc
+  ${ly_src_dir}/row_neon64.cc
+  ${ly_src_dir}/row_gcc.cc
+  ${ly_src_dir}/row_win.cc
+  ${ly_src_dir}/scale.cc
+  ${ly_src_dir}/scale_any.cc
+  ${ly_src_dir}/scale_argb.cc
+  ${ly_src_dir}/scale_common.cc
+  ${ly_src_dir}/scale_mips.cc
+  ${ly_src_dir}/scale_neon.cc
+  ${ly_src_dir}/scale_neon64.cc
+  ${ly_src_dir}/scale_gcc.cc
+  ${ly_src_dir}/scale_win.cc
+  ${ly_src_dir}/video_common.cc
+)
+
+set(ly_unittest_sources
+  ${ly_base_dir}/unit_test/basictypes_test.cc
+  ${ly_base_dir}/unit_test/color_test.cc
+  ${ly_base_dir}/unit_test/compare_test.cc
+  ${ly_base_dir}/unit_test/convert_test.cc
+  ${ly_base_dir}/unit_test/cpu_test.cc
+  ${ly_base_dir}/unit_test/math_test.cc
+  ${ly_base_dir}/unit_test/planar_test.cc
+  ${ly_base_dir}/unit_test/rotate_argb_test.cc
+  ${ly_base_dir}/unit_test/rotate_test.cc
+  ${ly_base_dir}/unit_test/scale_argb_test.cc
+  ${ly_base_dir}/unit_test/scale_test.cc
+  ${ly_base_dir}/unit_test/unit_test.cc
+  ${ly_base_dir}/unit_test/video_common_test.cc
+)
+
+set(ly_header_files
+  ${ly_inc_dir}/libyuv/basic_types.h
+  ${ly_inc_dir}/libyuv/compare.h
+  ${ly_inc_dir}/libyuv/convert.h
+  ${ly_inc_dir}/libyuv/convert_argb.h
+  ${ly_inc_dir}/libyuv/convert_from.h
+  ${ly_inc_dir}/libyuv/convert_from_argb.h
+  ${ly_inc_dir}/libyuv/cpu_id.h
+  ${ly_inc_dir}/libyuv/planar_functions.h
+  ${ly_inc_dir}/libyuv/rotate.h
+  ${ly_inc_dir}/libyuv/rotate_argb.h
+  ${ly_inc_dir}/libyuv/rotate_row.h
+  ${ly_inc_dir}/libyuv/row.h
+  ${ly_inc_dir}/libyuv/scale.h
+  ${ly_inc_dir}/libyuv/scale_argb.h
+  ${ly_inc_dir}/libyuv/scale_row.h
+  ${ly_inc_dir}/libyuv/version.h
+  ${ly_inc_dir}/libyuv/video_common.h
+  ${ly_inc_dir}/libyuv/mjpeg_decoder.h
+)
+
+include_directories(${ly_inc_dir})
+
+add_library(${ly_lib_name} STATIC ${ly_source_files})
+
+add_executable(convert ${ly_base_dir}/util/convert.cc)
+target_link_libraries(convert ${ly_lib_name})
+
+include(FindJPEG)
+if (JPEG_FOUND)
+  include_directories(${JPEG_INCLUDE_DIR})
+  target_link_libraries(convert ${JPEG_LIBRARY})
+  add_definitions(-DHAVE_JPEG)
+endif()
+
+if(TEST)
+  find_library(GTEST_LIBRARY gtest)
+  if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
+    set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+    if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
+      set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      add_library(gtest STATIC ${gtest_sources})
+      include_directories(${GTEST_SRC_DIR})
+      include_directories(${GTEST_SRC_DIR}/include)
+      set(GTEST_LIBRARY gtest)
+    else()
+      message(FATAL_ERROR "TEST is set but unable to find gtest library")
+    endif()
+  endif()
+
+  add_executable(libyuv_unittest ${ly_unittest_sources})
+  target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY} pthread)
+  if (JPEG_FOUND)
+    target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
+  endif()
+  
+  if(NACL AND NACL_LIBC STREQUAL "newlib")
+    target_link_libraries(libyuv_unittest glibc-compat)
+  endif()
+
+  target_link_libraries(libyuv_unittest gflags)
+  
+endif()
+
+install(TARGETS ${ly_lib_name} DESTINATION lib)
+install(FILES ${ly_header_files} DESTINATION include/libyuv)
+install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
diff --git a/files/DEPS b/files/DEPS
new file mode 100644
index 00000000..0a450050
--- /dev/null
+++ b/files/DEPS
@@ -0,0 +1,42 @@
+vars = {
+  # Override root_dir in your .gclient's custom_vars to specify a custom root
+  # folder name.
+  'root_dir': 'libyuv',
+  'extra_gyp_flag': '-Dextra_gyp_flag=0',
+  'chromium_git': 'https://chromium.googlesource.com',
+
+  # Roll the Chromium Git hash to pick up newer versions of all the
+  # dependencies and tools linked to in setup_links.py.
+  'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0',
+}
+
+# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
+# https; the latter can cause problems for users behind proxies.
+deps = {
+  Var('root_dir') + '/third_party/gflags/src':
+    Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
+}
+
+# Define rules for which include paths are allowed in our source.
+include_rules = [ '+gflags' ]
+
+hooks = [
+  {
+    # Clone chromium and its deps.
+    'name': 'sync chromium',
+    'pattern': '.',
+    'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
+               '--target-revision', Var('chromium_revision')],
+  },
+  {
+    # Create links to shared dependencies in Chromium.
+    'name': 'setup_links',
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/setup_links.py'],
+  },
+  {
+    # A change to a .gyp, .gypi, or to GYP itself should run the generator.
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/gyp_libyuv'],
+  },
+]
diff --git a/files/LICENSE b/files/LICENSE
index da40b336..c911747a 100644
--- a/files/LICENSE
+++ b/files/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011, Google Inc. All rights reserved.
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
diff --git a/files/LICENSE_THIRD_PARTY b/files/LICENSE_THIRD_PARTY
new file mode 100644
index 00000000..a71591e7
--- /dev/null
+++ b/files/LICENSE_THIRD_PARTY
@@ -0,0 +1,8 @@
+This source tree contains third party source code which is governed by third
+party licenses. This file contains references to files which are under other
+licenses than the one provided in the LICENSE file in the root of the source
+tree.
+
+Files governed by third party licenses:
+source/x86inc.asm
+
diff --git a/files/OWNERS b/files/OWNERS
new file mode 100644
index 00000000..2db52d30
--- /dev/null
+++ b/files/OWNERS
@@ -0,0 +1,13 @@
+fbarchard@chromium.org
+magjed@chromium.org
+torbjorng@chromium.org
+
+per-file *.gyp=kjellander@chromium.org
+per-file *.gn=kjellander@chromium.org
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=kjellander@chromium.org
+per-file gyp_libyuv.py=kjellander@chromium.org
+per-file setup_links.py=*
+per-file sync_chromium.py=kjellander@chromium.org
diff --git a/files/PRESUBMIT.py b/files/PRESUBMIT.py
new file mode 100755
index 00000000..58242bd9
--- /dev/null
+++ b/files/PRESUBMIT.py
@@ -0,0 +1,65 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import re
+import sys
+
+
+def GetDefaultTryConfigs(bots=None):
+  """Returns a list of ('bot', set(['tests']), optionally filtered by [bots].
+
+  For WebRTC purposes, we always return an empty list of tests, since we want
+  to run all tests by default on all our trybots.
+  """
+  return { 'tryserver.libyuv': dict((bot, []) for bot in bots)}
+
+
+# pylint: disable=W0613
+def GetPreferredTryMasters(project, change):
+  files = change.LocalPaths()
+  bots = [
+    'win',
+    'win_rel',
+    'win_x64_rel',
+    'win_x64_gn',
+    'win_x64_gn_rel',
+    'win_clang',
+    'win_clang_rel',
+    'win_x64_clang_rel',
+    'mac',
+    'mac_rel',
+    'mac_gn',
+    'mac_gn_rel',
+    'mac_asan',
+    'ios',
+    'ios_rel',
+    'ios_arm64',
+    'ios_arm64_rel',
+    'linux',
+    'linux_rel',
+    'linux_gn',
+    'linux_gn_rel',
+    'linux_memcheck',
+    'linux_tsan2',
+    'linux_asan',
+    'linux_msan',
+    'linux_ubsan',
+    'linux_ubsan_vptr',
+    'android',
+    'android_rel',
+    'android_clang',
+    'android_arm64',
+    'android_mips',
+    'android_x64',
+    'android_x86',
+    'android_gn',
+    'android_gn_rel',
+  ]
+  if not files or all(re.search(r'[\\/]OWNERS$', f) for f in files):
+    return {}
+  return GetDefaultTryConfigs(bots)
diff --git a/files/README.chromium b/files/README.chromium
new file mode 100644
index 00000000..251f8676
--- /dev/null
+++ b/files/README.chromium
@@ -0,0 +1,8 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 1602
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling functionality.
diff --git a/files/README.md b/files/README.md
new file mode 100644
index 00000000..b59b71c5
--- /dev/null
+++ b/files/README.md
@@ -0,0 +1,18 @@
+**libyuv** is an open source project that includes YUV scaling and conversion functionality.
+
+* Scale YUV to prepare content for compression, with point, bilinear or box filter.
+* Convert to YUV from webcam formats.
+* Convert from YUV to formats for rendering/effects.
+* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode.
+* Optimized for SSE2/SSSE3/AVX2 on x86/x64.
+* Optimized for Neon on Arm.
+* Optimized for DSP R2 on Mips.
+
+### Development
+
+See [Getting started] [1] for instructions on how to get started developing.
+
+You can also browse the [docs directory] [2] for more documentation.
+
+[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
+[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
diff --git a/files/all.gyp b/files/all.gyp
new file mode 100644
index 00000000..88a74842
--- /dev/null
+++ b/files/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+  'targets': [
+    {
+      'target_name': 'All',
+      'type': 'none',
+      'dependencies': [
+        'libyuv.gyp:*',
+        'libyuv_test.gyp:*',
+      ],
+    },
+  ],
+}
diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni
new file mode 100644
index 00000000..6d3aa1eb
--- /dev/null
+++ b/files/build_overrides/build.gni
@@ -0,0 +1,15 @@
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Using same overrides as WebRTC
+# See https://bugs.chromium.org/p/webrtc/issues/detail?id=5453.
+# Some WebRTC targets require the 10.7 deployment version of the Mac SDK and a
+# 10.11 min SDK but those targets are only used in non-Chromium builds. We can
+# remove this when Chromium drops 10.6 support and also requires 10.7.
+mac_sdk_min_build_override = "10.11"
+mac_deployment_target_build_override = "10.7"
diff --git a/files/chromium/.gclient b/files/chromium/.gclient
new file mode 100644
index 00000000..c1a86ecf
--- /dev/null
+++ b/files/chromium/.gclient
@@ -0,0 +1,20 @@
+solutions = [{
+  'name': 'src',
+  'url': 'https://chromium.googlesource.com/chromium/src.git',
+  'deps_file': '.DEPS.git',
+  'managed': False,
+  'custom_deps': {
+    # Skip syncing some large dependencies Libyuv will never need.
+    'src/third_party/cld_2/src': None,
+    'src/third_party/ffmpeg': None,
+    'src/third_party/hunspell_dictionaries': None,
+    'src/third_party/liblouis/src': None,
+    'src/third_party/pdfium': None,
+    'src/third_party/skia': None,
+    'src/third_party/trace-viewer': None,
+    'src/third_party/webrtc': None,
+  },
+  'safesync_url': ''
+}]
+
+cache_dir = None
diff --git a/files/chromium/README b/files/chromium/README
new file mode 100644
index 00000000..127f4b52
--- /dev/null
+++ b/files/chromium/README
@@ -0,0 +1,5 @@
+This .gclient file is used to do download a copy of Chromium.
+Libyuv uses the Chromium build toolchain and a number of shared
+dependencies by creating symlinks to folders in this checkout,
+using the ../setup_links.py script.
+
diff --git a/files/codereview.settings b/files/codereview.settings
index 11270bba..9b538069 100644
--- a/files/codereview.settings
+++ b/files/codereview.settings
@@ -1,12 +1,12 @@
 # This file is used by gcl to get repository specific information.
-# The LibYuv code review is via WebRtc's code review
-CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
+CODE_REVIEW_SERVER: codereview.chromium.org
 #CC_LIST:
-#VIEW_VC:
+VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
 #STATUS:
+FORCE_HTTPS_COMMIT_URL: True
+PROJECT: libyuv
 TRY_ON_UPLOAD: False
-TRYSERVER_HTTP_HOST: webrtc-cb-linux-master.cbf.corp.google.com
-TRYSERVER_HTTP_PORT: 9020
-#TRYSERVER_SVN_URL:
+TRYSERVER_ROOT: src
+TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv
 #GITCL_PREUPLOAD:
 #GITCL_PREDCOMMIT:
diff --git a/files/docs/environment_variables.md b/files/docs/environment_variables.md
new file mode 100644
index 00000000..5802599e
--- /dev/null
+++ b/files/docs/environment_variables.md
@@ -0,0 +1,32 @@
+# Introduction
+
+For test purposes, environment variables can be set to control libyuv behavior.  These should only be used for testing, to narrow down bugs or to test performance.
+
+# CPU
+
+By default the cpu is detected and the most advanced form of SIMD is used.  But you can disable instruction sets selectively, or completely, falling back on C code.  Set the variable to 1 to disable the specified instruction set.
+
+    LIBYUV_DISABLE_ASM
+    LIBYUV_DISABLE_X86
+    LIBYUV_DISABLE_SSE2
+    LIBYUV_DISABLE_SSSE3
+    LIBYUV_DISABLE_SSE41
+    LIBYUV_DISABLE_SSE42
+    LIBYUV_DISABLE_AVX
+    LIBYUV_DISABLE_AVX2
+    LIBYUV_DISABLE_AVX3
+    LIBYUV_DISABLE_ERMS
+    LIBYUV_DISABLE_FMA3
+    LIBYUV_DISABLE_DSPR2
+    LIBYUV_DISABLE_NEON
+
+# Test Width/Height/Repeat
+
+The unittests default to a small image (128x72) to run fast.  This can be set by environment variable to test a specific resolutions.
+You can also repeat the test a specified number of iterations, allowing benchmarking and profiling.
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    set LIBYUV_CPU_INFO=-1
diff --git a/files/docs/filtering.md b/files/docs/filtering.md
new file mode 100644
index 00000000..8696976e
--- /dev/null
+++ b/files/docs/filtering.md
@@ -0,0 +1,196 @@
+# Introduction
+
+This document discusses the current state of filtering in libyuv. An emphasis on maximum performance while avoiding memory exceptions, and minimal amount of code/complexity.  See future work at end.
+
+# LibYuv Filter Subsampling
+
+There are 2 challenges with subsampling
+
+1. centering of samples, which involves clamping on edges
+2. clipping a source region
+
+Centering depends on scale factor and filter mode.
+
+# Down Sampling
+
+If scaling down, the stepping rate is always src_width / dst_width.
+
+    dx = src_width / dst_width;
+
+e.g. If scaling from 1280x720 to 640x360, the step thru the source will be 2.0, stepping over 2 pixels of source for each pixel of destination.
+
+Centering, depends on filter mode.
+
+*Point* downsampling takes the middle pixel.
+
+    x = dx >> 1;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle.  For even scale factors, this rounds up and takes the pixel to the right of center.  e.g. scale of 4x down will take pixel 2.
+
+**Bilinear** filter, uses the 2x2 pixels in the middle.
+
+    x = dx / 2 - 0.5;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle, and point sampling is used.
+For even scale factors, this evenly filters the middle 2x2 pixels.  e.g. 4x down will filter pixels 1,2 at 50% in both directions.
+
+**Box** filter averages the entire box so sampling starts at 0.
+
+    x = 0;
+
+For a scale factor of 2x down, this is equivalent to bilinear.
+
+# Up Sampling
+
+**Point** upsampling use stepping rate of src_width / dst_width and a starting coordinate of 0.
+
+    x = 0;
+    dx = src_width / dst_width;
+
+e.g. If scaling from 640x360 to 1280x720 the step thru the source will be 0.0, stepping half a pixel of source for each pixel of destination. Each pixel is replicated by the scale factor.
+
+**Bilinear** filter stretches such that the first pixel of source maps to the first pixel of destination, and the last pixel of source maps to the last pixel of destination.
+
+    x = 0;
+    dx = (src_width - 1) / (dst_width - 1);
+
+This method is not technically correct, and will likely change in the future.
+
+* It is inconsistent with the bilinear down sampler.  The same method could be used for down sampling, and then it would be more reversible, but that would prevent specialized 2x down sampling.
+* Although centered, the image is slightly magnified.
+* The filtering was changed in early 2013 - previously it used:
+
+        x = 0;
+        dx = (src_width - 1) / (dst_width - 1);
+
+Which is the correct scale factor, but shifted the image left, and extruded the last pixel.  The reason for the change was to remove the extruding code from the low level row functions, allowing 3 functions to sshare the same row functions - ARGBScale, I420Scale, and ARGBInterpolate.  Then the one function was ported to many cpu variations: SSE2, SSSE3, AVX2, Neon and 'Any' version for any number of pixels and alignment.  The function is also specialized for 0,25,50,75%.
+
+The above goes still has the potential to read the last pixel 100% and last pixel + 1 0%, which may cause a memory exception.  So the left pixel goes to a fraction less than the last pixel, but filters in the minimum amount of it, and the maximum of the last pixel.
+
+    dx = FixedDiv((src_width << 16) - 0x00010001, (dst << 16) - 0x00010000);
+
+**Box** filter for upsampling switches over to Bilinear.
+
+# Scale snippet:
+
+    #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+    #define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
+                                         (dst << 16) - 0x00010000);
+
+    // Compute slope values for stepping.
+    void ScaleSlope(int src_width, int src_height,
+                    int dst_width, int dst_height,
+                    FilterMode filtering,
+                    int* x, int* y, int* dx, int* dy) {
+      assert(x != NULL);
+      assert(y != NULL);
+      assert(dx != NULL);
+      assert(dy != NULL);
+      assert(src_width != 0);
+      assert(src_height != 0);
+      assert(dst_width > 0);
+      assert(dst_height > 0);
+      if (filtering == kFilterBox) {
+        // Scale step for point sampling duplicates all pixels equally.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = 0;
+        *y = 0;
+      } else if (filtering == kFilterBilinear) {
+        // Scale step for bilinear sampling renders last pixel once for upsample.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        if (dst_height <= src_height) {
+          *dy = FixedDiv(src_height,  dst_height);
+          *y = CENTERSTART(*dy, -32768);  // 32768 = -0.5 to center bilinear.
+        } else if (dst_height > 1) {
+          *dy = FIXEDDIV1(src_height, dst_height);
+          *y = 0;
+        }
+      } else if (filtering == kFilterLinear) {
+        // Scale step for bilinear sampling renders last pixel once for upsample.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        *dy = FixedDiv(src_height, dst_height);
+        *y = *dy >> 1;
+      } else {
+        // Scale step for point sampling duplicates all pixels equally.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = CENTERSTART(*dx, 0);
+        *y = CENTERSTART(*dy, 0);
+      }
+      // Negative src_width means horizontally mirror.
+      if (src_width < 0) {
+        *x += (dst_width - 1) * *dx;
+        *dx = -*dx;
+        src_width = -src_width;
+      }
+    }
+
+# Future Work
+
+Point sampling should ideally be the same as bilinear, but pixel by pixel, round to nearest neighbor.  But as is, it is reversible and exactly matches ffmpeg at all scale factors, both up and down.  The scale factor is
+
+    dx = src_width / dst_width;
+
+The step value is centered for down sample:
+
+    x = dx / 2;
+
+Or starts at 0 for upsample.
+
+    x = 0;
+
+Bilinear filtering is currently correct for down sampling, but not for upsampling.
+Upsampling is stretching the first and last pixel of source, to the first and last pixel of destination.
+
+    dx = (src_width - 1) / (dst_width - 1);<br>
+    x = 0;
+
+It should be stretching such that the first pixel is centered in the middle of the scale factor, to match the pixel that would be sampled for down sampling by the same amount.  And same on last pixel.
+
+    dx = src_width / dst_width;<br>
+    x = dx / 2 - 0.5;
+
+This would start at -0.5 and go to last pixel + 0.5, sampling 50% from last pixel + 1.
+Then clamping would be needed.  On GPUs there are numerous ways to clamp.
+
+1. Clamp the coordinate to the edge of the texture, duplicating the first and last pixel.
+2. Blend with a constant color, such as transparent black.  Typically best for fonts.
+3. Mirror the UV coordinate, which is similar to clamping.  Good for continuous tone images.
+4. Wrap the coordinate, for texture tiling.
+5. Allow the coordinate to index beyond the image, which may be the correct data if sampling a subimage.
+6. Extrapolate the edge based on the previous pixel.  pixel -0.5 is computed from slope of pixel 0 and 1.
+
+Some of these are computational, even for a GPU, which is one reason textures are sometimes limited to power of 2 sizes.
+We do care about the clipping case, where allowing coordinates to become negative and index pixels before the image is the correct data.  But normally for simple scaling, we want to clamp to the edge pixel.  For example, if bilinear scaling from 3x3 to 30x30, we’d essentially want 10 pixels of each of the original 3 pixels.  But we want the original pixels to land in the middle of each 10 pixels, at offsets 5, 15 and 25.  There would be filtering between 5 and 15 between the original pixels 0 and 1.  And filtering between 15 and 25 from original pixels 1 and 2.  The first 5 pixels are clamped to pixel 0 and the last 5 pixels are clamped to pixel 2.
+The easiest way to implement this is copy the original 3 pixels to a buffer, and duplicate the first and last pixels.  0,1,2 becomes 0, 0,1,2, 2.  Then implement a filtering without clamping.  We call this source extruding.  Its only necessary on up sampling, since down sampler will always have valid surrounding pixels.
+Extruding is practical when the image is already copied to a temporary buffer.   It could be done to the original image, as long as the original memory is restored, but valgrind and/or memory protection would disallow this, so it requires a memcpy to a temporary buffer, which may hurt performance.  The memcpy has a performance advantage, from a cache point of view, that can actually make this technique faster, depending on hardware characteristics.
+Vertical extrusion can be done with a memcpy of the first/last row, or clamping a pointer.
+
+
+The other way to implement clamping is handle the edges with a memset.  e.g. Read first source pixel and memset the first 5 pixels.  Filter pixels 0,1,2 to 5 to 25.  Read last pixel and memset the last 5 pixels.  Blur is implemented with this method like this, which has 3 loops per row - left, middle and right.
+
+Box filter is only used for 2x down sample or more.  Its based on integer sized boxes.  Technically it should be filtered edges, but thats substantially slower (roughly 100x), and at that point you may as well do a cubic filter which is more correct.
+
+Box filter currently sums rows into a row buffer.  It does this with
+
+Mirroring will use the same slope as normal, but with a negative.
+The starting coordinate needs to consider the scale factor and filter.  e.g. box filter of 30x30 to 3x3 with mirroring would use -10 for step, but x = 20.  width (30) - dx.
+
+Step needs to be accurate, so it uses an integer divide.  This is as much as 5% of the profile.  An approximated divide is substantially faster, but the inaccuracy causes stepping beyond the original image boundaries.  3 general solutions:
+
+1. copy image to buffer with padding.  allows for small errors in stepping.
+2. hash the divide, so common values are quickly found.
+3. change api so caller provides the slope.
diff --git a/files/docs/formats.md b/files/docs/formats.md
new file mode 100644
index 00000000..a7cfed82
--- /dev/null
+++ b/files/docs/formats.md
@@ -0,0 +1,133 @@
+# Introduction
+
+Formats (FOURCC) supported by libyuv are detailed here.
+
+# Core Formats
+
+There are 2 core formats supported by libyuv - I420 and ARGB.  All YUV formats can be converted to/from I420.  All RGB formats can be converted to/from ARGB.
+
+Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
+
+# OSX Core Media Pixel Formats
+
+This is how OSX formats map to libyuv
+
+    enum {
+      kCMPixelFormat_32ARGB          = 32,      FOURCC_BGRA
+      kCMPixelFormat_32BGRA          = 'BGRA',  FOURCC_ARGB
+      kCMPixelFormat_24RGB           = 24,      FOURCC_RAW
+      kCMPixelFormat_16BE555         = 16,      Not supported.
+      kCMPixelFormat_16BE565         = 'B565',  Not supported.
+      kCMPixelFormat_16LE555         = 'L555',  FOURCC_RGBO
+      kCMPixelFormat_16LE565         = 'L565',  FOURCC_RGBP
+      kCMPixelFormat_16LE5551        = '5551',  FOURCC_RGBO
+      kCMPixelFormat_422YpCbCr8      = '2vuy',  FOURCC_UYVY
+      kCMPixelFormat_422YpCbCr8_yuvs = 'yuvs',  FOURCC_YUY2
+      kCMPixelFormat_444YpCbCr8      = 'v308',  FOURCC_I444 ?
+      kCMPixelFormat_4444YpCbCrA8    = 'v408',  Not supported.
+      kCMPixelFormat_422YpCbCr16     = 'v216',  Not supported.
+      kCMPixelFormat_422YpCbCr10     = 'v210',  FOURCC_V210 previously.  Removed now.
+      kCMPixelFormat_444YpCbCr10     = 'v410',  Not supported.
+      kCMPixelFormat_8IndexedGray_WhiteIsZero = 0x00000028,  Not supported.
+    };
+
+
+# FOURCC (Four Charactacter Code) List
+
+The following is extracted from video_common.h as a complete list of formats supported by libyuv.
+
+    enum FourCC {
+      // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+      FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+      FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+      FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+      FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+      FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+      FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+      FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+      FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+      FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+      // 2 Secondary YUV formats: row biplanar.
+      FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+      FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+      // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+      FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+      FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+      FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+      FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+      FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+      FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+      FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+      FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+      FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+      // 4 Secondary RGB formats: 4 Bayer Patterns.
+      FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+      FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+      FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+      FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+      // 1 Primary Compressed YUV format.
+      FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+      // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+      FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+      FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+      FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+      FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+      FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+      FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+      // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+      FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+      FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+      FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+      FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+      FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+      FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+      FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+      FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+      FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+      FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+      FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+      FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+      FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+      FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+      FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+      FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+      FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+      // 1 Auxiliary compressed YUV format set aside for capturer.
+      FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+# The ARGB FOURCC
+
+There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA.  ARGB is most common by far, used for screen formats, and windows webcam drivers.
+
+The fourcc describes the order of channels in a ***register***.
+
+A fourcc provided by capturer, can be thought of string, e.g. "ARGB".
+
+On little endian machines, as an int, this would have 'A' in the lowest byte.  The FOURCC macro reverses the order:
+
+    #define FOURCC(a, b, c, d) (((uint32)(a)) | ((uint32)(b) << 8) | ((uint32)(c) << 16) | ((uint32)(d) << 24))
+
+So the "ARGB" string, read as an uint32, is
+
+    FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B')
+
+If you were to read ARGB pixels as uint32's, the alpha would be in the high byte, and the blue in the lowest byte.  In memory, these are stored little endian, so 'B' is first, then 'G', 'R' and 'A' last.
+
+When calling conversion functions, the names match the FOURCC, so in this case it would be I420ToARGB().
+
+All formats can be converted to/from ARGB.
+
+Most 'planar_functions' work on ARGB (e.g. ARGBBlend).
+
+Some are channel order agnostic (e.g. ARGBScale).
+
+Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so its a macro).
+
+ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha.  Other functions don't care.
diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md
new file mode 100644
index 00000000..7cd56167
--- /dev/null
+++ b/files/docs/getting_started.md
@@ -0,0 +1,429 @@
+# Getting Started
+
+How to get and build the libyuv code.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+    gclient config https://chromium.googlesource.com/libyuv/libyuv
+    gclient sync
+
+
+Then you'll get a .gclient file like:
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+
+Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+    target_os = ["android", "unix"];
+
+Then run:
+
+    export GYP_DEFINES="OS=android"
+    gclient sync
+
+Caveat: Theres an error with Google Play services updates.  If you get the error "Your version of the Google Play services library is not up to date", run the following:
+    cd chromium/src
+    ./build/android/play_services/update.py download
+    cd ../..
+
+For Windows the gclient sync must be done from an Administrator command prompt.
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+    git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+    set GYP_DEFINES=target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -j7 -C out\Release
+    ninja -j7 -C out\Debug
+
+    set GYP_DEFINES=target_arch=x64
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug_x64
+    ninja -C out\Release_x64
+
+#### Building with clangcl
+    set GYP_DEFINES=clang=1 target_arch=ia32 libyuv_enable_svn=1
+    set LLVM_REPO_URL=svn://svn.chromium.org/llvm-project
+    call python tools\clang\scripts\update.py
+    call python gyp_libyuv -fninja libyuv_test.gyp
+    ninja -C out\Debug
+    ninja -C out\Release
+
+### OSX
+
+Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+
+    GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+armv7
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+arm64
+
+    GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+both armv7 and arm64 (fat)
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+simulator
+
+    GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
+    ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
+    ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+    GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm64
+
+    GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+ia32
+
+    GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+    GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+
+mipsel
+
+    GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm32 disassembly:
+
+    third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+
+arm64 disassembly:
+
+    third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+
+Running tests:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+
+Running test with C code:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+
+#### Building with GN
+
+    gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+    gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+    ninja -C out/Release
+    ninja -C out/Debug
+
+### Building Offical with GN
+
+    gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+    ninja -C out/Official
+
+### Linux
+
+    GYP_DEFINES="target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+#### CentOS
+
+On CentOS 32 bit the following work around allows a sync:
+
+    export GYP_DEFINES="host_arch=ia32"
+    gclient sync
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
+
+    gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+
+### Build targets
+
+    ninja -C out/Debug libyuv
+    ninja -C out/Debug libyuv_unittest
+    ninja -C out/Debug compare
+    ninja -C out/Debug convert
+    ninja -C out/Debug psnr
+    ninja -C out/Debug cpuid
+
+
+## Building the Library with make
+
+### Linux
+
+    make -j7 V=1 -f linux.mk
+    make -j7 V=1 -f linux.mk clean
+    make -j7 V=1 -f linux.mk CXX=clang++
+
+## Building the Library with cmake
+
+Install cmake: http://www.cmake.org/
+
+Default debug build:
+
+    mkdir out
+    cd out
+    cmake ..
+    cmake --build .
+
+Release build/install
+
+    mkdir out
+    cd out
+    cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+    cmake --build . --config Release
+    sudo cmake --build . --target install --config Release
+
+### Windows 8 Phone
+
+Pre-requisite:
+
+* Install Visual Studio 2012 and Arm to your environment.<br>
+
+Then:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+or with Visual Studio 2013:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+    nmake /f winarm.mk clean
+    nmake /f winarm.mk
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this.
+
+    gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+### 64 bit Windows
+
+    set GYP_DEFINES=target_arch=x64
+    gclient runhooks V=1
+
+### ARM Linux
+
+    export GYP_DEFINES="target_arch=arm"
+    export CROSSTOOL=`<path>`/arm-none-linux-gnueabi
+    export CXX=$CROSSTOOL-g++
+    export CC=$CROSSTOOL-gcc
+    export AR=$CROSSTOOL-ar
+    export AS=$CROSSTOOL-as
+    export RANLIB=$CROSSTOOL-ranlib
+    gclient runhooks
+
+## Running Unittests
+
+### Windows
+
+    out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with specific unittest to run.  May include wildcards. e.g.
+
+    out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+    c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*
+
+
+## Memory tools
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+    set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug
+    drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
+
+### Running UBSan
+
+See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
+
+Sanitizers available: TSan, MSan, ASan, UBSan, LSan
+
+    GYP_DEFINES='ubsan=1' gclient runhooks
+    ninja -C out/Release
+
+### Running Valgrind memcheck
+
+Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
+
+[1]: http://valgrind.org
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+           "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
+        },
+        "safesync_url": "",
+      },
+    ]
+
+Then run:
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+
+For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
+
+### Running Thread Sanitizer (TSan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
+
+### Running Address Sanitizer (ASan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
+
+## Benchmarking
+
+The unittests can be used to benchmark.
+
+### Windows
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
+
+
+### Linux and Mac
+
+    LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+    libyuvTest.I420ToARGB_Opt (547 ms)
+
+Indicates 0.547 ms/frame for 1280 x 720.
+
+## Making a change
+
+    gclient sync
+    git checkout -b mycl -t origin/master
+    git pull
+    <edit files>
+    git add -u
+    git commit -m "my change"
+    git cl lint
+    git cl try
+    git cl upload -r a-reviewer@chomium.org -s
+    <once approved..>
+    git cl land
diff --git a/files/docs/rotation.md b/files/docs/rotation.md
new file mode 100644
index 00000000..fb84fce5
--- /dev/null
+++ b/files/docs/rotation.md
@@ -0,0 +1,103 @@
+# Introduction
+
+Rotation by multiplies of 90 degrees allows mobile devices to rotate webcams from landscape to portrait.  The higher level functions ConvertToI420 and ConvertToARGB allow rotation of any format.  Optimized functionality is supported for I420, ARGB, NV12 and NV21.
+
+# ConvertToI420
+
+    int ConvertToI420(const uint8* src_frame, size_t src_size,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+This function crops, converts, and rotates.  You should think of it in that order.
+  * Crops the original image, which is src_width x src_height, to crop_width x crop_height.  At this point the image is still not rotated.
+  * Converts the cropped region to I420.  Supports inverted source for src_height negative.
+  * Rotates by 90, 180 or 270 degrees.
+The buffer the caller provides should account for rotation.  Be especially important to get stride of the destination correct.
+
+e.g.
+640 x 480 NV12 captured<br>
+Crop to 640 x 360<br>
+Rotate by 90 degrees to 360 x 640.<br>
+Caller passes stride of 360 for Y and 360 / 2 for U and V.<br>
+Caller passes crop_width of 640, crop_height of 360.<br>
+
+# ConvertToARGB
+
+    int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                      uint8* dst_argb, int dst_stride_argb,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows, 16 bytes at a time.
+
+# I420Rotate
+
+    int I420Rotate(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Destination is rotated, so pass dst_stride_y etc that consider rotation.<br>
+Rotate by 180 can be done in place, but 90 and 270 can not.
+
+Implementation (Neon/SSE2) uses 8 x 8 block transpose, so best efficiency is with sizes and pointers that are aligned to 8.
+
+Cropping can be achieved by adjusting the src_y/u/v pointers and src_width, src_height.
+
+Lower level plane functions are provided, allowing other planar formats to be rotated.  (e.g. I444)
+
+For other planar YUV formats (I444, I422, I411, I400, NV16, NV24), the planar functions are exposed and can be called directly
+
+
+    // Rotate a plane by 0, 90, 180, or 270.
+    int RotatePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int src_width, int src_height, enum RotationMode mode);
+
+# ARGBRotate
+
+    LIBYUV_API
+    int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows.
+
+Rotate by 90, or any angle, can be achieved using ARGBAffine.
+
+# Mirror - Horizontal Flip
+
+Mirror functions for horizontally flipping an image, which can be useful for 'self view' of a webcam.
+
+    int I420Mirror(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+    int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+Mirror functionality can also be achieved with the I420Scale and ARGBScale functions by passing negative width and/or height.
+
+# Invert - Vertical Flip
+
+Inverting can be achieved with almost any libyuv function by passing a negative source height.
+
+I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+
+
diff --git a/files/download_vs_toolchain.py b/files/download_vs_toolchain.py
new file mode 100644
index 00000000..4b345789
--- /dev/null
+++ b/files/download_vs_toolchain.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run the vs_toolchain.py script to download the
+# Visual Studio toolchain. It's just a temporary measure while waiting for the
+# Chrome team to move find_depot_tools into src/build to get rid of these
+# workarounds (similar one in gyp_libyuv).
+
+import os
+import sys
+
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
+
+
+import vs_toolchain
+
+
+if __name__ == '__main__':
+  sys.exit(vs_toolchain.main())
diff --git a/files/gyp_libyuv b/files/gyp_libyuv
new file mode 100755
index 00000000..445b924f
--- /dev/null
+++ b/files/gyp_libyuv
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run GYP for libyuv. It contains selected parts of the
+# main function from the src/build/gyp_chromium file.
+
+import glob
+import os
+import shlex
+import sys
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import gyp_chromium
+import gyp_helper
+import vs_toolchain
+
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
+import gyp
+
+def GetSupplementalFiles():
+  """Returns a list of the supplemental files that are included in all GYP
+  sources."""
+  # Can't use the one in gyp_chromium since the directory location of the root
+  # is different.
+  return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
+
+
+if __name__ == '__main__':
+  args = sys.argv[1:]
+
+  if int(os.environ.get('GYP_CHROMIUM_NO_ACTION', 0)):
+    print 'Skipping gyp_libyuv due to GYP_CHROMIUM_NO_ACTION env var.'
+    sys.exit(0)
+
+  # This could give false positives since it doesn't actually do real option
+  # parsing.  Oh well.
+  gyp_file_specified = False
+  for arg in args:
+    if arg.endswith('.gyp'):
+      gyp_file_specified = True
+      break
+
+  # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
+  if not gyp_file_specified:
+    # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
+    # work, but chdir'ing and adding the relative path does. Spooky :/
+    os.chdir(checkout_root)
+    args.append('all.gyp')
+
+  # There shouldn't be a circular dependency relationship between .gyp files,
+  args.append('--no-circular-check')
+
+  # Default to ninja unless GYP_GENERATORS is set.
+  if not os.environ.get('GYP_GENERATORS'):
+    os.environ['GYP_GENERATORS'] = 'ninja'
+
+  vs2013_runtime_dll_dirs = None
+  if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
+    vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
+
+  # Enforce gyp syntax checking. This adds about 20% execution time.
+  args.append('--check')
+
+  supplemental_includes = gyp_chromium.GetSupplementalFiles()
+  gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
+
+  # Automatically turn on crosscompile support for platforms that need it.
+  if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
+          gyp_vars_dict.get('OS') in ['android', 'ios'],
+          'GYP_CROSSCOMPILE' not in os.environ)):
+    os.environ['GYP_CROSSCOMPILE'] = '1'
+
+  args.extend(['-I' + i for i in
+               gyp_chromium.additional_include_files(supplemental_includes,
+                                                     args)])
+
+  # Set the gyp depth variable to the root of the checkout.
+  args.append('--depth=' + os.path.relpath(checkout_root))
+
+  print 'Updating projects from gyp files...'
+  sys.stdout.flush()
+
+  # Off we go...
+  gyp_rc = gyp.main(args)
+
+  if vs2013_runtime_dll_dirs:
+    x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
+    vs_toolchain.CopyVsRuntimeDlls(
+        os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
+        (x86_runtime, x64_runtime))
+
+  sys.exit(gyp_rc)
diff --git a/files/gyp_libyuv.py b/files/gyp_libyuv.py
new file mode 100644
index 00000000..ac42038d
--- /dev/null
+++ b/files/gyp_libyuv.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+
+# This script is a modified copy of the src/build/gyp_chromium.py file. 
+# It is needed for parallel processing.
+
+# This file is (possibly, depending on python version) imported by
+# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
+# through the multiprocessing library.
+
+# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
+# imports that don't end in .py (and aren't directories with an
+# __init__.py). This wrapper makes "import gyp_libyuv" work with
+# those old versions and makes it possible to execute gyp_libyuv.py
+# directly on Windows where the extension is useful.
+
+import os
+
+path = os.path.abspath(os.path.split(__file__)[0])
+execfile(os.path.join(path, 'gyp_libyuv'))
diff --git a/files/include/libyuv.h b/files/include/libyuv.h
index 06f26aae..de652836 100644
--- a/files/include/libyuv.h
+++ b/files/include/libyuv.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -16,13 +16,16 @@
 #include "libyuv/convert.h"
 #include "libyuv/convert_argb.h"
 #include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
+#include "libyuv/mjpeg_decoder.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"
 #include "libyuv/scale_argb.h"
+#include "libyuv/scale_row.h"
 #include "libyuv/version.h"
 #include "libyuv/video_common.h"
 
diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h
index 9e9f2abc..beb750ba 100644
--- a/files/include/libyuv/basic_types.h
+++ b/files/include/libyuv/basic_types.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -13,10 +13,13 @@
 
 #include <stddef.h>  // for NULL, size_t
 
-#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h>  // for uintptr_t on x86
+#else
 #include <stdint.h>  // for uintptr_t
 #endif
 
+#ifndef GG_LONGLONG
 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
 #ifdef COMPILER_MSVC
@@ -30,7 +33,7 @@ typedef __int64 int64;
 #endif
 #define INT64_F "I64"
 #else  // COMPILER_MSVC
-#ifdef __LP64__
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
 typedef unsigned long uint64;  // NOLINT
 typedef long int64;  // NOLINT
 #ifndef INT64_C
@@ -40,7 +43,7 @@ typedef long int64;  // NOLINT
 #define UINT64_C(x) x ## UL
 #endif
 #define INT64_F "l"
-#else  // __LP64__
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
 typedef unsigned long long uint64;  // NOLINT
 typedef long long int64;  // NOLINT
 #ifndef INT64_C
@@ -59,6 +62,7 @@ typedef short int16;  // NOLINT
 typedef unsigned char uint8;
 typedef signed char int8;
 #endif  // INT_TYPES_DEFINED
+#endif  // GG_LONGLONG
 
 // Detect compiler is for x86 or x64.
 #if defined(__x86_64__) || defined(_M_X64) || \
@@ -71,9 +75,14 @@ typedef signed char int8;
 #endif
 
 #ifndef ALIGNP
+#ifdef __cplusplus
 #define ALIGNP(p, t) \
     (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
     ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
+#endif
 #endif
 
 #if !defined(LIBYUV_API)
@@ -94,4 +103,16 @@ typedef signed char int8;
 #endif  // __GNUC__
 #endif  // LIBYUV_API
 
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+  defined(__i386__) || defined(_M_IX86) || \
+  defined(__arm__) || defined(_M_ARM) || \
+  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
 #endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h
index 5fd924b8..08b2bb2e 100644
--- a/files/include/libyuv/compare.h
+++ b/files/include/libyuv/compare.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -22,6 +22,11 @@ extern "C" {
 LIBYUV_API
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
 
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a,
diff --git a/files/include/libyuv/compare_row.h b/files/include/libyuv/compare_row.h
new file mode 100644
index 00000000..38a957b2
--- /dev/null
+++ b/files/include/libyuv/compare_row.h
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#endif
+
+// The following are available for Visual C and GCC:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
+#define HAS_HASHDJB2_SSE41
+#define HAS_SUMSQUAREERROR_SSE2
+#endif
+
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+#endif
+
+// The following are available for Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_  NOLINT
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
index 1d4b6a5b..a2cdc571 100644
--- a/files/include/libyuv/convert.h
+++ b/files/include/libyuv/convert.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -12,32 +12,17 @@
 #define INCLUDE_LIBYUV_CONVERT_H_
 
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes.
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
+
+#include "libyuv/rotate.h"  // For enum RotationMode.
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// Alias.
-#define I420ToI420 I420Copy
-
-// Copy I420 to I420.
-LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Convert I422 to I420.
+// Convert I444 to I420.
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
+int I444ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_y, int dst_stride_y,
@@ -45,9 +30,9 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert I444 to I420.
+// Convert I422 to I420.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
+int I422ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_y, int dst_stride_y,
@@ -65,6 +50,17 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
 // Convert I400 (grey) to I420.
 LIBYUV_API
 int I400ToI420(const uint8* src_y, int src_stride_y,
@@ -73,7 +69,9 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert NV12 to I420. Also used for NV21.
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_uv, int src_stride_uv,
@@ -82,18 +80,10 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert M420 to I420.
+// Convert NV21 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert Q420 to I420.
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_yuy2, int src_stride_yuy2,
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
@@ -115,9 +105,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert V210 to I420.
+// Convert M420 to I420.
 LIBYUV_API
-int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
@@ -205,9 +195,12 @@ int MJPGToI420(const uint8* sample, size_t sample_size,
                uint8* dst_v, int dst_stride_v,
                int src_width, int src_height,
                int dst_width, int dst_height);
-#endif
 
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height);
+#endif
 
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
@@ -225,7 +218,7 @@ int MJPGToI420(const uint8* sample, size_t sample_size,
 //              crop_y = (src_height - dst_height) / 2
 // "src_width" / "src_height" is size of src_frame in pixels.
 //   "src_height" can be negative indicating a vertically flipped image source.
-// "dst_width" / "dst_height" is size of destination to crop to.
+// "crop_width" / "crop_height" is the size to crop the src to.
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
@@ -238,8 +231,8 @@ int ConvertToI420(const uint8* src_frame, size_t src_size,
                   uint8* dst_v, int dst_stride_v,
                   int crop_x, int crop_y,
                   int src_width, int src_height,
-                  int dst_width, int dst_height,
-                  RotationMode rotation,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
                   uint32 format);
 
 #ifdef __cplusplus
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
index 86085252..996f4768 100644
--- a/files/include/libyuv/convert_argb.h
+++ b/files/include/libyuv/convert_argb.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -12,13 +12,10 @@
 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_
 
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
+
+#include "libyuv/rotate.h"  // For enum RotationMode.
 
 // TODO(fbarchard): This set of functions should exactly match convert.h
-// Add missing V210 and Q420.
 // TODO(fbarchard): Add tests. Create random content of right size and convert
 // with C vs Opt and or to I420 and compare.
 // TODO(fbarchard): Some of these functions lack parameter setting.
@@ -61,6 +58,22 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
 // Convert I411 to ARGB.
 LIBYUV_API
 int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -69,17 +82,38 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// Convert I400 (grey) to ARGB.
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate);
+
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// Convert I400 to ARGB. Reverse of ARGBToI400.
+// Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
-                         uint8* dst_argb, int dst_stride_argb,
-                         int width, int height);
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB
 
 // Convert NV12 to ARGB.
 LIBYUV_API
@@ -101,13 +135,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// TODO(fbarchard): Convert Q420 to ARGB.
-// LIBYUV_API
-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
-//                const uint8* src_yuy2, int src_stride_yuy2,
-//                uint8* dst_argb, int dst_stride_argb,
-//                int width, int height);
-
 // Convert YUY2 to ARGB.
 LIBYUV_API
 int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -120,11 +147,69 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// TODO(fbarchard): Convert V210 to ARGB.
-// LIBYUV_API
-// int V210ToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-//                uint8* dst_argb, int dst_stride_argb,
-//                int width, int height);
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
 
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
@@ -177,7 +262,6 @@ int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height);
 
-#ifdef HAVE_JPEG
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
@@ -185,9 +269,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
                uint8* dst_argb, int dst_stride_argb,
                int src_width, int src_height,
                int dst_width, int dst_height);
-#endif
-
-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
 
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
@@ -205,7 +286,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
 //              crop_y = (src_height - dst_height) / 2
 // "src_width" / "src_height" is size of src_frame in pixels.
 //   "src_height" can be negative indicating a vertically flipped image source.
-// "dst_width" / "dst_height" is size of destination to crop to.
+// "crop_width" / "crop_height" is the size to crop the src to.
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
@@ -216,8 +297,8 @@ int ConvertToARGB(const uint8* src_frame, size_t src_size,
                   uint8* dst_argb, int dst_stride_argb,
                   int crop_x, int crop_y,
                   int src_width, int src_height,
-                  int dst_width, int dst_height,
-                  RotationMode rotation,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
                   uint32 format);
 
 #ifdef __cplusplus
diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h
index 4eae950c..7522ea5c 100644
--- a/files/include/libyuv/convert_from.h
+++ b/files/include/libyuv/convert_from.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -56,26 +56,31 @@ int I400Copy(const uint8* src_y, int src_stride_y,
              uint8* dst_y, int dst_stride_y,
              int width, int height);
 
-// TODO(fbarchard): I420ToNV12
-// TODO(fbarchard): I420ToM420
-// TODO(fbarchard): I420ToQ420
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
+int I420ToNV21(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
                int width, int height);
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_frame, int dst_stride_frame,
                int width, int height);
 
 LIBYUV_API
-int I420ToV210(const uint8* src_y, int src_stride_y,
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_frame, int dst_stride_frame,
@@ -123,13 +128,24 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
               uint8* dst_frame, int dst_stride_frame,
               int width, int height);
 
-LIBYUV_API
+//LIBYUV_API
 int I420ToRGB565(const uint8* src_y, int src_stride_y,
                  const uint8* src_u, int src_stride_u,
                  const uint8* src_v, int src_stride_v,
                  uint8* dst_frame, int dst_stride_frame,
                  int width, int height);
 
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
+
 LIBYUV_API
 int I420ToARGB1555(const uint8* src_y, int src_stride_y,
                    const uint8* src_u, int src_stride_u,
@@ -144,8 +160,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
                    uint8* dst_frame, int dst_stride_frame,
                    int width, int height);
 
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
-
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h
new file mode 100644
index 00000000..1df53200
--- /dev/null
+++ b/files/include/libyuv/convert_from_argb.h
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_rgb, int dst_stride_rgb,
+              int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h
index 0914f1d2..dfb7445e 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/files/include/libyuv/cpu_id.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -18,7 +18,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Internal flag to indicate cpuid is initialized.
+// Internal flag to indicate cpuid requires initialization.
 static const int kCpuInitialized = 0x1;
 
 // These flags are only valid on ARM processors.
@@ -34,6 +34,14 @@ static const int kCpuHasSSE41 = 0x80;
 static const int kCpuHasSSE42 = 0x100;
 static const int kCpuHasAVX = 0x200;
 static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasAVX3 = 0x2000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasDSPR2 = 0x20000;
 
 // Internal function used to auto-init.
 LIBYUV_API
@@ -48,19 +56,21 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
 // MaskCpuFlags(-1) to enable all cpu specific optimizations.
-// MaskCpuFlags(0) to disable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
 LIBYUV_API
 void MaskCpuFlags(int enable_flags);
 
 // Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
 LIBYUV_API
-void CpuId(int cpu_info[4], int info_type);
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/include/libyuv/format_conversion.h b/files/include/libyuv/format_conversion.h
deleted file mode 100644
index 06bd387f..00000000
--- a/files/include/libyuv/format_conversion.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_  // NOLINT
-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert Bayer RGB formats to I420.
-LIBYUV_API
-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
-    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
-
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Convert I420 to Bayer RGB formats.
-LIBYUV_API
-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-// Temporary API mapper.
-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
-    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
-
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-// Convert Bayer RGB formats to ARGB.
-LIBYUV_API
-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Converts ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-// Temporary API mapper.
-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_  NOLINT
diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h
index 67090cf0..8423121d 100644
--- a/files/include/libyuv/mjpeg_decoder.h
+++ b/files/include/libyuv/mjpeg_decoder.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -13,6 +13,7 @@
 
 #include "libyuv/basic_types.h"
 
+#ifdef __cplusplus
 // NOTE: For a simplified public API use convert.h MJPGToI420().
 
 struct jpeg_common_struct;
@@ -21,6 +22,16 @@ struct jpeg_source_mgr;
 
 namespace libyuv {
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 static const uint32 kUnknownDataSize = 0xFFFFFFFF;
 
 enum JpegSubsamplingType {
@@ -32,6 +43,17 @@ enum JpegSubsamplingType {
   kJpegUnknown
 };
 
+struct Buffer {
+  const uint8* data;
+  int len;
+};
+
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+
 struct SetJmpErrorMgr;
 
 // MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
@@ -41,7 +63,7 @@ struct SetJmpErrorMgr;
 // MJPEG frames.
 //
 // See http://tools.ietf.org/html/rfc2435
-class MJpegDecoder {
+class LIBYUV_API MJpegDecoder {
  public:
   typedef void (*CallbackFunction)(void* opaque,
                                    const uint8* const* data,
@@ -59,11 +81,12 @@ class MJpegDecoder {
   ~MJpegDecoder();
 
   // Loads a new frame, reads its headers, and determines the uncompressed
-  // image format. Returns true if image looks valid and format is supported.
-  // If return value is true, then the values for all the following getters
-  // are populated.
+  // image format.
+  // Returns LIBYUV_TRUE if image looks valid and format is supported.
+  // If return value is LIBYUV_TRUE, then the values for all the following
+  // getters are populated.
   // src_len is the size of the compressed mjpeg frame in bytes.
-  bool LoadFrame(const uint8* src, size_t src_len);
+  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
 
   // Returns width of the last loaded frame in pixels.
   int GetWidth();
@@ -107,7 +130,7 @@ class MJpegDecoder {
 
   // Call this after LoadFrame() if you decide you don't want to decode it
   // after all.
-  bool UnloadFrame();
+  LIBYUV_BOOL UnloadFrame();
 
   // Decodes the entire image into a one-buffer-per-color-component format.
   // dst_width must match exactly. dst_height must be <= to image height; if
@@ -116,13 +139,13 @@ class MJpegDecoder {
   // at least GetComponentSize(i). The pointers in planes are incremented
   // to point to after the end of the written data.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  bool DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
 
   // Decodes the entire image and passes the data via repeated calls to a
   // callback function. Each call will get the data for a whole number of
   // image scanlines.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  bool DecodeToCallback(CallbackFunction fn, void* opaque,
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
                         int dst_width, int dst_height);
 
   // The helper function which recognizes the jpeg sub-sampling type.
@@ -130,34 +153,14 @@ class MJpegDecoder {
      int* subsample_x, int* subsample_y, int number_of_components);
 
  private:
-  struct Buffer {
-    const uint8* data;
-    int len;
-  };
-
-  struct BufferVector {
-    Buffer* buffers;
-    int len;
-    int pos;
-  };
-
-  // Methods that are passed to jpeglib.
-  static int fill_input_buffer(jpeg_decompress_struct* cinfo);
-  static void init_source(jpeg_decompress_struct* cinfo);
-  static void skip_input_data(jpeg_decompress_struct* cinfo,
-                              long num_bytes);  // NOLINT
-  static void term_source(jpeg_decompress_struct* cinfo);
-
-  static void ErrorHandler(jpeg_common_struct* cinfo);
-
   void AllocOutputBuffers(int num_outbufs);
   void DestroyOutputBuffers();
 
-  bool StartDecode();
-  bool FinishDecode();
+  LIBYUV_BOOL StartDecode();
+  LIBYUV_BOOL FinishDecode();
 
   void SetScanlinePointers(uint8** data);
-  bool DecodeImcuRow();
+  LIBYUV_BOOL DecodeImcuRow();
 
   int GetComponentScanlinePadding(int component);
 
@@ -169,9 +172,9 @@ class MJpegDecoder {
   jpeg_source_mgr* source_mgr_;
   SetJmpErrorMgr* error_mgr_;
 
-  // true iff at least one component has scanline padding. (i.e.,
+  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
   // GetComponentScanlinePadding() != 0.)
-  bool has_scanline_padding_;
+  LIBYUV_BOOL has_scanline_padding_;
 
   // Temporaries used to point to scanline outputs.
   int num_outbufs_;  // Outermost size of all arrays below.
@@ -185,4 +188,5 @@ class MJpegDecoder {
 
 }  // namespace libyuv
 
+#endif  //  __cplusplus
 #endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h
index 7e43dabb..881b0c5c 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/files/include/libyuv/planar_functions.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -22,20 +22,53 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height);
+
+// Set a plane of data to a 32 bit value.
 LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
               uint32 value);
 
-// Alias.
-#define I400ToI400 CopyPlane
-
-// Copy a plane of data (I420 to I400).
+// Copy I400.  Supports inverting.
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
+int I400ToI400(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
 
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
 // Convert YUY2 to I422.
 LIBYUV_API
 int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
@@ -45,20 +78,37 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
                int width, int height);
 
 // Convert UYVY to I422.
+LIBYUV_API
 int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
                uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
 // I420 mirror.
 LIBYUV_API
 int I420Mirror(const uint8* src_y, int src_stride_y,
@@ -69,6 +119,19 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
 // ARGB mirror.
 LIBYUV_API
 int ARGBMirror(const uint8* src_argb, int src_stride_argb,
@@ -82,67 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);
 
-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
-// Aliases.
-#define ARGBToBGRA BGRAToARGB
-#define ARGBToABGR ABGRToARGB
-
-// Convert ARGB To RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height);
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_rgb, int dst_stride_rgb,
-              int width, int height);
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height);
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height);
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// ARGB little endian (bgra in memory) to I422.
-LIBYUV_API
-int ARGBToI422(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // I422ToARGB is in convert_argb.h
 // Convert I422 to BGRA.
 LIBYUV_API
@@ -168,6 +170,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
                uint8* dst_rgba, int dst_stride_rgba,
                int width, int height);
 
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+               uint8* dst_rgb24, int dst_stride_rgb24,
+               int width, int height);
+
 // Draw a rectangle into I420.
 LIBYUV_API
 int I420Rect(uint8* dst_y, int dst_stride_y,
@@ -198,14 +208,27 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
               int x, int y, int width, int height);
 
 // Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
 // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
 // The first 4 coefficients apply to B, G, R, A and produce B of the output.
 // The next 4 coefficients apply to B, G, R, A and produce G of the output.
 // The last 4 coefficients apply to B, G, R, A and produce R of the output.
 LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int x, int y, int width, int height);
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int x, int y, int width, int height);
 
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
@@ -214,6 +237,36 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
                    const uint8* table_argb,
                    int x, int y, int width, int height);
 
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int x, int y, int width, int height);
+
+// Apply a luma/color table each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma_rgb_table,
+                       int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4rd row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be dirived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
@@ -229,6 +282,24 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);
 
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_a, int dst_stride_a,
+                     int width, int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height);
+
 typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width);
 
@@ -237,6 +308,7 @@ LIBYUV_API
 ARGBBlendRow GetARGBBlend();
 
 // Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
 int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -244,6 +316,52 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
 // Convert I422 to YUY2.
 LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
@@ -272,12 +390,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height);
 
-// Convert MJPG to ARGB.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h, int dw, int dh);
-
+// Internal function - do not call directly.
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
@@ -286,8 +399,11 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
                              int width, int height);
 
 // Blur ARGB image.
-// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned
-// to 16 byte boundary.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
 LIBYUV_API
 int ARGBBlur(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
@@ -300,35 +416,88 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height, uint32 value);
 
-// Interpolate between two ARGB images using specified amount of interpolation
+// Interpolate between two images using specified amount of interpolation
 // (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
-// and 255 means 1% src_argb0 and 99% src_argb1.
-// Internally uses ARGBScale bilinear filtering.
-// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+                     const uint8* src1, int src_stride1,
+                     uint8* dst, int dst_stride,
+                     int width, int height, int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation);
 
-#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
-#define YUV_DISABLE_ASM
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+                    const uint8* src0_u, int src0_stride_u,
+                    const uint8* src0_v, int src0_stride_v,
+                    const uint8* src1_y, int src1_stride_y,
+                    const uint8* src1_u, int src1_stride_u,
+                    const uint8* src1_v, int src1_stride_v,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
 #endif
-// Row functions for copying a pixels from a source with a slope to a row
+
+// Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
-// The following are available on all x86 platforms:
-#if !defined(YUV_DISABLE_ASM) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
index e7608a2d..8af60b89 100644
--- a/files/include/libyuv/rotate.h
+++ b/files/include/libyuv/rotate.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -19,7 +19,7 @@ extern "C" {
 #endif
 
 // Supported rotation.
-enum RotationMode {
+typedef enum RotationMode {
   kRotate0 = 0,  // No rotation.
   kRotate90 = 90,  // Rotate 90 degrees clockwise.
   kRotate180 = 180,  // Rotate 180 degrees.
@@ -29,7 +29,7 @@ enum RotationMode {
   kRotateNone = 0,
   kRotateClockwise = 90,
   kRotateCounterClockwise = 270,
-};
+} RotationModeEnum;
 
 // Rotate I420 frame.
 LIBYUV_API
@@ -39,7 +39,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height, RotationMode mode);
+               int src_width, int src_height, enum RotationMode mode);
 
 // Rotate NV12 input and store in I420.
 LIBYUV_API
@@ -48,9 +48,15 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height, RotationMode mode);
+                     int src_width, int src_height, enum RotationMode mode);
 
-// Rotate planes by 90, 180, 270
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
 LIBYUV_API
 void RotatePlane90(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
@@ -75,7 +81,7 @@ void RotateUV90(const uint8* src, int src_stride,
 // Rotations for when U and V are interleaved.
 // These functions take one input pointer and
 // split the data into two buffers while
-// rotating them.
+// rotating them. Deprecated.
 LIBYUV_API
 void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
@@ -91,6 +97,7 @@ void RotateUV270(const uint8* src, int src_stride,
 // The 90 and 270 functions are based on transposes.
 // Doing a transpose with reversing the read/write
 // order will result in a rotation by +- 90 degrees.
+// Deprecated.
 LIBYUV_API
 void TransposePlane(const uint8* src, int src_stride,
                     uint8* dst, int dst_stride,
diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h
index a2781df3..660ff557 100644
--- a/files/include/libyuv/rotate_argb.h
+++ b/files/include/libyuv/rotate_argb.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -23,7 +23,7 @@ extern "C" {
 LIBYUV_API
 int ARGBRotate(const uint8* src_argb, int src_stride_argb,
                uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height, RotationMode mode);
+               int src_width, int src_height, enum RotationMode mode);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h
new file mode 100644
index 00000000..ebc487f9
--- /dev/null
+++ b/files/include/libyuv/rotate_row.h
@@ -0,0 +1,121 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_DSPR2
+#define HAS_TRANSPOSEUVWX8_DSPR2
+#endif  // defined(__mips__)
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+                          uint8* dst_a, int dst_stride_a,
+                          uint8* dst_b, int dst_stride_b, int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
+                              uint8* dst_a, int dst_stride_a,
+                              uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h
index 4814f254..055880ba 100644
--- a/files/include/libyuv/row.h
+++ b/files/include/libyuv/row.h
@@ -4,13 +4,15 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
 #define INCLUDE_LIBYUV_ROW_H_
 
+#include <stdlib.h>  // For malloc.
+
 #include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
@@ -18,696 +20,1817 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// TODO(fbarchard): Remove kMaxStride
-#define kMaxStride (2880 * 4)
 #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
 
-#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
-#define YUV_DISABLE_ASM
+#ifdef __cplusplus
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
+  uint8* var = reinterpret_cast<uint8*>                                        \
+      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
+  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
+#endif
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);  \
+  var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
 #endif
 // True if compiling for SSSE3 as a requirement.
 #if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
 #define LIBYUV_SSSE3_ONLY
 #endif
 
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
 // The following are available on all x86 platforms:
-#if !defined(YUV_DISABLE_ASM) && \
+#if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Conversions.
-#define HAS_ABGRTOARGBROW_SSSE3
+// Conversions:
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
 #define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTORGBAROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOARGBROW_SSSE3
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
+#define HAS_H422TOARGBROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
-#define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
-#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORROWUV_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
 #define HAS_SETROW_X86
-#define HAS_SPLITUV_SSE2
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
-#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
 
-// Effects
+// Effects:
+#define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
 #define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBINTERPOLATEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_BLENDPLANEROW_SSSE3
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
+// caveat: clangcl uses row_win.cc which works.
+#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
+    !defined(__i386__) || defined(_MSC_VER)
+// TODO(fbarchard): fix build error on x86 debug
+// https://code.google.com/p/libyuv/issues/detail?id=524
+#define HAS_I411TOARGBROW_SSSE3
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#endif
 #endif
 
-// The following are Windows only:
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_ABGRTOARGBROW_SSSE3
-#define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_I422TORGBAROW_SSSE3
-#define HAS_RGBATOARGBROW_SSSE3
-#define HAS_RGBATOUVROW_SSSE3
-#define HAS_RGBATOYROW_SSSE3
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_H422TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#if !(defined(_DEBUG) && defined(__i386__))
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
+#endif
+
+// The following are available for AVX2 Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
 #endif
 
-// The following are disabled when SSSE3 is available:
-#if !defined(YUV_DISABLE_ASM) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATE_SSE2
-#define HAS_ARGBBLENDROW_SSE2
-#define HAS_MIRRORROW_SSE2
+// The following are also available on x64 Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are available on Neon platforms
-#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_I422TOABGRROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422ALPHATOARGBROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
 #define HAS_I422TOARGBROW_NEON
-#define HAS_I422TOBGRAROW_NEON
-#define HAS_I422TORAWROW_NEON
 #define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
 #define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
 #define HAS_MIRRORROW_NEON
-#define HAS_MIRRORROWUV_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
-#define HAS_SPLITUV_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
 #define HAS_YUY2TOUV422ROW_NEON
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
 
-// TODO(fbarchard): Hook these up to calling functions.
-#define HAS_ABGRTOARGBROW_NEON
-#define HAS_ARGBTORAWROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORGBAROW_NEON
-#define HAS_BGRATOARGBROW_NEON
-#define HAS_NV12TOARGBROW_NEON
-#define HAS_NV21TOARGBROW_NEON
-#define HAS_RAWTOARGBROW_NEON
-#define HAS_RGB24TOARGBROW_NEON
-#define HAS_RGBATOARGBROW_NEON
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
 #endif
 
-#if defined(_MSC_VER) && !defined(__CLR_VER)
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOARGBROW_DSPR2
+#define HAS_INTERPOLATEROW_DSPR2
+#define HAS_MIRRORROW_DSPR2
+#define HAS_MIRRORUVROW_DSPR2
+#define HAS_SPLITUVROW_DSPR2
+#endif
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint8 uvec8[16];
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var
 typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) uint16 uvec16[8];
 typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
 typedef __declspec(align(16)) uint32 uvec32[4];
-#elif defined(__GNUC__)
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
 typedef int16 __attribute__((vector_size(16))) vec16;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
 typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
 typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
 #else
 #define SIMD_ALIGNED(var) var
-typedef int8 vec8[16];
-typedef uint8 uvec8[16];
+#define SIMD_ALIGNED32(var) var
 typedef int16 vec16[8];
-typedef uint16 uvec16[8];
 typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
 typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
+#endif
+
+#if defined(__aarch64__)
+// This struct is for Arm64 color conversion.
+struct YuvConstants {
+  uvec16 kUVToRB;
+  uvec16 kUVToRB2;
+  uvec16 kUVToG;
+  uvec16 kUVToG2;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#elif defined(__arm__)
+// This struct is for ArmV7 color conversion.
+struct YuvConstants {
+  uvec8 kUVToRB;
+  uvec8 kUVToG;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#else
+// This struct is for Intel color conversion.
+struct YuvConstants {
+  lvec8 kUVToB;
+  lvec8 kUVToG;
+  lvec8 kUVToR;
+  lvec16 kUVBiasB;
+  lvec16 kUVBiasG;
+  lvec16 kUVBiasR;
+  lvec16 kYToRgb;
+};
+
+// Offsets into YuvConstants structure
+#define KUVTOB   0
+#define KUVTOG   32
+#define KUVTOR   64
+#define KUVBIASB 96
+#define KUVBIASG 128
+#define KUVBIASR 160
+#define KYTORGB  192
 #endif
 
+// Conversion matrix for YUV to RGB
+extern const struct YuvConstants kYuvI601Constants;  // BT.601
+extern const struct YuvConstants kYuvJPEGConstants;  // JPeg color space
+extern const struct YuvConstants kYuvH709Constants;  // BT.709
+
+// Conversion matrix for YVU to BGR
+extern const struct YuvConstants kYvuI601Constants;  // BT.601
+extern const struct YuvConstants kYvuJPEGConstants;  // JPeg color space
+extern const struct YuvConstants kYvuH709Constants;  // BT.709
+
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
 
-void I422ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+// NaCL macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg "\n" \
+    BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%" #arg "\n" \
+    BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+    BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#else  // defined(__native_client__) && defined(__x86_64__)
+#define NACL_R14
+#define BUNDLEALIGN
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+    #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__) || defined(__aarch64__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base)
+#endif
+#endif
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToBGRARow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToABGRRow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I422AlphaToARGBRow_NEON(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGB24Row_NEON(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRAWRow_NEON(const uint8* y_buf,
-                       const uint8* u_buf,
-                       const uint8* v_buf,
-                       uint8* rgb_buf,
-                       int width);
-void NV12ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void NV21ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
 
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
+                             int width);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
+                             int width);
 
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
                        uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
                        uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                            uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int width);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int width);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+                          uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+                          uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+                         uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                            uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+                              int src_stride_argb1555,
+                              uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+                              int src_stride_argb4444,
+                              uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+                  uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
 
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
 
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
-void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
-void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                          int width);
 
-void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
+                                  int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
+                                  int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
+                                  int width);
+
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int width);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
 
-void SetRow8_X86(uint8* dst, uint32 v32, int count);
-void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                   int dst_stride, int height);
-void SetRow8_NEON(uint8* dst, uint32 v32, int count);
-void SetRows32_NEON(uint8* dst, uint32 v32, int width,
-                    int dst_stride, int height);
-void SetRow8_C(uint8* dst, uint32 v32, int count);
-void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
-
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RGBAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
+                              int width);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
 
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix);
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-
-void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix);
-void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix);
-void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix);
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
-
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
-void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
-
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* argb_buf,
-                     int width);
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
 
-void I422ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* argb_buf,
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
+                             int width);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void I411ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void NV12ToARGBRow_C(const uint8* y_buf,
-                     const uint8* uv_buf,
-                     uint8* argb_buf,
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void NV21ToARGBRow_C(const uint8* y_buf,
-                     const uint8* vu_buf,
-                     uint8* argb_buf,
+void I422AlphaToARGBRow_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          const uint8* a_buf,
+                          uint8* dst_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void I422ToBGRARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* bgra_buf,
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void I422ToABGRRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* abgr_buf,
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-
-void I422ToRGBARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgba_buf,
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToRGB24Row_C(const uint8* y_buf,
-                      const uint8* u_buf,
-                      const uint8* v_buf,
-                      uint8* rgb24_buf,
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* dst_rgb24,
+                      const struct YuvConstants* yuvconstants,
                       int width);
-void I422ToRAWRow_C(const uint8* y_buf,
-                    const uint8* u_buf,
-                    const uint8* v_buf,
-                    uint8* raw_buf,
-                    int width);
-
-void YToARGBRow_C(const uint8* y_buf,
-                  uint8* rgb_buf,
-                  int width);
-
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* argb_buf,
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* argb_buf,
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* argb_buf,
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* vu_buf,
-                         uint8* argb_buf,
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I422ToBGRARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* bgra_buf,
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I422ToABGRRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* abgr_buf,
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgba_buf,
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* argb_buf,
-                                   int width);
-
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* argb_buf,
-                                   int width);
-
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width);
-
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* argb_buf,
-                                   int width);
-
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* vu_buf,
-                                   uint8* argb_buf,
-                                   int width);
-
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* bgra_buf,
-                                   int width);
-
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* abgr_buf,
-                                   int width);
-
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgba_buf,
-                                   int width);
-
-void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* argb_buf,
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgba,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* argb_buf,
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* uv_buf,
-                             uint8* argb_buf,
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* vu_buf,
-                             uint8* argb_buf,
+void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  const uint8* a_buf,
+                                  uint8* dst_argb,
+                                  const struct YuvConstants* yuvconstants,
+                                  int width);
+void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 const uint8* a_buf,
+                                 uint8* dst_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* bgra_buf,
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_uv,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* abgr_buf,
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_vu,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-
-void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgba_buf,
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_uv,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgba,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_rgba,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_rgba,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_rgba,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_rgba,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_u,
+                               const uint8* src_v,
+                               uint8* dst_rgba,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgba,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
 
-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* argb_buf,
-                     int width);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
                         uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                        uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
                     uint8* dst_argb, int width);
 
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGBAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void I422ToARGBRow_Any_NEON(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+// Unattenuated planar alpha blend.
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
+                             const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                        const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
+                            const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width);
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+                  uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToBGRARow_Any_NEON(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToABGRRow_Any_NEON(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 const uint8* src_a,
+                                 uint8* dst_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_NEON(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToRAWRow_Any_NEON(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb_buf,
-                           int width);
-void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
-                            const uint8* uv_buf,
-                            uint8* argb_buf,
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
-                            const uint8* uv_buf,
-                            uint8* argb_buf,
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
                             int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
 
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int pix);
+                   uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
+                          uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+                             uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
+                          uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int pix);
-
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_NEON(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
+                         uint8* dst_u, uint8* dst_v, int width);
 
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int pix);
+                   uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
+                          uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
+                          uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int pix);
-
+                             uint8* dst_u, uint8* dst_v, int width);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
+// Effects related row functions.
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
 
 // Inverse table for unattenuate, shared by C and SSE2.
-extern uint32 fixed_invtbl8[256];
+extern const uint32 fixed_invtbl8[256];
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
 
 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
 
 void ARGBSepiaRow_C(uint8* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
 
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width);
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width);
 
 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
 
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
 void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
                        int interval_offset, int width);
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
 
 // Used for blur.
-void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count);
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst, int count);
 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                   const int32* previous_cumsum, int width);
 
-void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
-                              int width, int area, uint8* dst, int count);
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count);
 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                                const int32* previous_cumsum, int width);
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
@@ -715,17 +1838,102 @@ LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
 
-void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction);
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride_ptr,
+                      int width, int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
+                              int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
                               int source_y_fraction);
 
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride_ptr,
+                         int width, int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
 #endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
-
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 18098798..102158d1 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -18,12 +18,13 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Supported filtering
-enum FilterMode {
+// Supported filtering.
+typedef enum FilterMode {
   kFilterNone = 0,  // Point sample; Fastest.
-  kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2  // Highest quality.
-};
+  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3  // Highest quality.
+} FilterModeEnum;
 
 // Scale a YUV plane.
 LIBYUV_API
@@ -31,7 +32,14 @@ void ScalePlane(const uint8* src, int src_stride,
                 int src_width, int src_height,
                 uint8* dst, int dst_stride,
                 int dst_width, int dst_height,
-                FilterMode filtering);
+                enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
 
 // Scales a YUV 4:2:0 image from the src width and height to the
 // dst width and height.
@@ -52,8 +60,20 @@ int I420Scale(const uint8* src_y, int src_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int dst_width, int dst_height,
-              FilterMode filtering);
+              enum FilterMode filtering);
 
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
@@ -62,17 +82,18 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
           uint8* dst_y, uint8* dst_u, uint8* dst_v,
           int dst_stride_y, int dst_stride_u, int dst_stride_v,
           int dst_width, int dst_height,
-          bool interpolate);
+          LIBYUV_BOOL interpolate);
 
 // Legacy API.  Deprecated.
 LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                bool interpolate);
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
 
 // For testing, allow disabling of specialized scalers.
 LIBYUV_API
-void SetUseReferenceImpl(bool use);
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
index 1af0e1dc..b56cf520 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/files/include/libyuv/scale_argb.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -24,7 +24,29 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
               int src_width, int src_height,
               uint8* dst_argb, int dst_stride_argb,
               int dst_width, int dst_height,
-              FilterMode filtering);
+              enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering);
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h
new file mode 100644
index 00000000..df699e6c
--- /dev/null
+++ b/files/include/libyuv/scale_row.h
@@ -0,0 +1,503 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_DSPR2
+#define HAS_SCALEROWDOWN4_DSPR2
+#define HAS_SCALEROWDOWN34_DSPR2
+#define HAS_SCALEROWDOWN38_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x, int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+//  to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h
index e782ae18..ca0c062e 100644
--- a/files/include/libyuv/version.h
+++ b/files/include/libyuv/version.h
@@ -4,13 +4,13 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 397
+#define LIBYUV_VERSION 1602
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h
index 5d812c98..ad934e42 100644
--- a/files/include/libyuv/video_common.h
+++ b/files/include/libyuv/video_common.h
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -27,9 +27,15 @@ extern "C" {
 // Convert four characters to a FourCC code.
 // Needs to be a macro otherwise the OS X compiler complains when the kFormat*
 // constants are used in a switch.
+#ifdef __cplusplus
 #define FOURCC(a, b, c, d) ( \
     (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
     (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
 
 // Some pages discussing FourCC codes:
 //   http://www.fourcc.org/yuv.php
@@ -38,59 +44,79 @@ extern "C" {
 //   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
 //   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
 
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxilliary formats call primary converters.
 enum FourCC {
-  // Canonical fourcc codes used in our code.
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
   FOURCC_I420 = FOURCC('I', '4', '2', '0'),
   FOURCC_I422 = FOURCC('I', '4', '2', '2'),
   FOURCC_I444 = FOURCC('I', '4', '4', '4'),
   FOURCC_I411 = FOURCC('I', '4', '1', '1'),
   FOURCC_I400 = FOURCC('I', '4', '0', '0'),
-  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
-  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
-  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
-  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
   FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
   FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
   FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
-  FOURCC_V210 = FOURCC('V', '2', '1', '0'),
-  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
   FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
   FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
-  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
-  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // bgr565.
-  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // abgr1555.
-  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444.
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
   FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
-  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
-  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
-  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
-  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
-  // Next four are Bayer RGB formats. The four characters define the order of
-  // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
   FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
   FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
   FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
   FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
 
-  // Aliases for canonical fourcc codes, replaced with their canonical
-  // equivalents by CanonicalFourCC().
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),  // unofficial fourcc
+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // unofficial fourcc
+
+  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
   FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
   FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
   FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
   FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
   FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
   FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
-  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
   FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
   FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
   FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
   FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
   FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
 
   // Match any fourcc.
-  FOURCC_ANY  = 0xFFFFFFFF,
+  FOURCC_ANY = -1,
 };
 
 enum FourCCBpp {
@@ -100,37 +126,34 @@ enum FourCCBpp {
   FOURCC_BPP_I444 = 24,
   FOURCC_BPP_I411 = 12,
   FOURCC_BPP_I400 = 8,
-  FOURCC_BPP_YU12 = 12,
-  FOURCC_BPP_YV12 = 12,
-  FOURCC_BPP_YV16 = 16,
-  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
   FOURCC_BPP_YUY2 = 16,
   FOURCC_BPP_UYVY = 16,
   FOURCC_BPP_M420 = 12,
   FOURCC_BPP_Q420 = 12,
-  FOURCC_BPP_V210 = 22,  // 128 / 6 actually.
-  FOURCC_BPP_24BG = 24,
   FOURCC_BPP_ARGB = 32,
   FOURCC_BPP_BGRA = 32,
   FOURCC_BPP_ABGR = 32,
   FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW  = 24,
   FOURCC_BPP_RGBP = 16,
   FOURCC_BPP_RGBO = 16,
   FOURCC_BPP_R444 = 16,
-  FOURCC_BPP_RAW  = 24,
-  FOURCC_BPP_NV21 = 12,
-  FOURCC_BPP_NV12 = 12,
-  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
-  FOURCC_BPP_H264 = 0,
-  // Next four are Bayer RGB formats. The four characters define the order of
-  // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
   FOURCC_BPP_RGGB = 8,
   FOURCC_BPP_BGGR = 8,
   FOURCC_BPP_GRBG = 8,
   FOURCC_BPP_GBRG = 8,
-
-  // Aliases for canonical fourcc codes, replaced with their canonical
-  // equivalents by CanonicalFourCC().
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
   FOURCC_BPP_IYUV = 12,
   FOURCC_BPP_YU16 = 16,
   FOURCC_BPP_YU24 = 24,
@@ -143,6 +166,8 @@ enum FourCCBpp {
   FOURCC_BPP_BA81 = 8,
   FOURCC_BPP_RGB3 = 24,
   FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
 
   // Match any fourcc.
   FOURCC_BPP_ANY  = 0,  // 0 means unknown.
diff --git a/files/libyuv.gyp b/files/libyuv.gyp
index 18137538..db4b5490 100644
--- a/files/libyuv.gyp
+++ b/files/libyuv.gyp
@@ -3,34 +3,116 @@
 # Use of this source code is governed by a BSD-style license
 # that can be found in the LICENSE file in the root of the source
 # tree. An additional intellectual property rights grant can be found
-# in the file PATENTS.  All contributing project authors may
+# in the file PATENTS. All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 
 {
+  'includes': [
+    'libyuv.gypi',
+  ],
+  # Make sure that if we are being compiled to an xcodeproj, nothing tries to
+  # include a .pch.
+  'xcode_settings': {
+    'GCC_PREFIX_HEADER': '',
+    'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
+  },
   'variables': {
-     'use_system_libjpeg%': 0,
+    'use_system_libjpeg%': 0,
+    'libyuv_disable_jpeg%': 0,
+    # 'chromium_code' treats libyuv as internal and increases warning level.
+    'chromium_code': 1,
+    # clang compiler default variable usable by other apps that include libyuv.
+    'clang%': 0,
+    # Link-Time Optimizations.
+    'use_lto%': 0,
+    'build_neon': 0,
+    'conditions': [
+       ['(target_arch == "armv7" or target_arch == "armv7s" or \
+       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+       and (arm_neon == 1 or arm_neon_optional == 1)',
+       {
+         'build_neon': 1,
+       }],
+    ],
   },
+
   'targets': [
     {
       'target_name': 'libyuv',
+      # Change type to 'shared_library' to build .so or .dll files.
       'type': 'static_library',
-      # 'type': 'shared_library',
+      'variables': {
+        'optimize': 'max',  # enable O2 and ltcg.
+      },
+      # Allows libyuv.a redistributable library without external dependencies.
+      'standalone_static_library': 1,
       'conditions': [
-         ['use_system_libjpeg==0', {
-          'dependencies': [
-             '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+       # Disable -Wunused-parameter
+        ['clang == 1', {
+          'cflags': [
+            '-Wno-unused-parameter',
+         ],
+        }],
+        ['build_neon != 0', {
+          'defines': [
+            'LIBYUV_NEON',
+          ],
+          'cflags!': [
+            '-mfpu=vfp',
+            '-mfpu=vfpv3',
+            '-mfpu=vfpv3-d16',
+            # '-mthumb',  # arm32 not thumb
+          ],
+          'conditions': [
+            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
+            ['clang == 0 and use_lto == 1', {
+              'cflags!': [
+                '-flto',
+                '-ffat-lto-objects',
+              ],
+            }],
+            # arm64 does not need -mfpu=neon option as neon is not optional
+            ['target_arch != "arm64"', {
+              'cflags': [
+                '-mfpu=neon',
+                # '-marm',  # arm32 not thumb
+              ],
+            }],
           ],
-        }, {
-          'link_settings': {
-            'libraries': [
-              '-ljpeg',
-            ],
-          },
         }],
-      ],
+        ['OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG'
+          ],
+          'conditions': [
+            # Caveat system jpeg support may not support motion jpeg
+            [ 'use_system_libjpeg == 1', {
+              'dependencies': [
+                 '<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
+              ],
+            }, {
+              'dependencies': [
+                 '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+              ],
+            }],
+            [ 'use_system_libjpeg == 1', {
+              'link_settings': {
+                'libraries': [
+                  '-ljpeg',
+                ],
+              }
+            }],
+          ],
+        }],
+      ], #conditions
       'defines': [
-        'HAVE_JPEG',
-        # 'LIBYUV_BUILDING_SHARED_LIBRARY',
+        # Enable the following 3 macros to turn off assembly for specified CPU.
+        # 'LIBYUV_DISABLE_X86',
+        # 'LIBYUV_DISABLE_NEON',
+        # 'LIBYUV_DISABLE_MIPS',
+        # Enable the following macro to build libyuv as a shared library (dll).
+        # 'LIBYUV_USING_SHARED_LIBRARY',
+        # TODO(fbarchard): Make these into gyp defines.
       ],
       'include_dirs': [
         'include',
@@ -41,48 +123,21 @@
           'include',
           '.',
         ],
+        'conditions': [
+          ['OS == "android" and target_arch == "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker64',
+            ],
+          }],
+          ['OS == "android" and target_arch != "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker',
+            ],
+          }],
+        ], #conditions
       },
       'sources': [
-        # includes.
-        'include/libyuv.h',
-        'include/libyuv/basic_types.h',
-        'include/libyuv/compare.h',
-        'include/libyuv/convert.h',
-        'include/libyuv/convert_argb.h',
-        'include/libyuv/convert_from.h',
-        'include/libyuv/cpu_id.h',
-        'include/libyuv/format_conversion.h',
-        'include/libyuv/mjpeg_decoder.h',
-        'include/libyuv/planar_functions.h',
-        'include/libyuv/rotate.h',
-        'include/libyuv/rotate_argb.h',
-        'include/libyuv/row.h',
-        'include/libyuv/scale.h',
-        'include/libyuv/scale_argb.h',
-        'include/libyuv/version.h',
-        'include/libyuv/video_common.h',
-
-        # sources.
-        'source/compare.cc',
-        'source/compare_neon.cc',
-        'source/convert.cc',
-        'source/convert_argb.cc',
-        'source/convert_from.cc',
-        'source/cpu_id.cc',
-        'source/format_conversion.cc',
-        'source/mjpeg_decoder.cc',
-        'source/planar_functions.cc',
-        'source/rotate.cc',
-        'source/rotate_argb.cc',
-        'source/rotate_neon.cc',
-        'source/row_common.cc',
-        'source/row_neon.cc',
-        'source/row_posix.cc',
-        'source/row_win.cc',
-        'source/scale.cc',
-        'source/scale_neon.cc',
-        'source/scale_argb.cc',
-        'source/video_common.cc',
+        '<@(libyuv_sources)',
       ],
     },
   ], # targets.
diff --git a/files/libyuv.gypi b/files/libyuv.gypi
new file mode 100644
index 00000000..73fdec0a
--- /dev/null
+++ b/files/libyuv.gypi
@@ -0,0 +1,79 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'variables': {
+    'libyuv_sources': [
+      # includes.
+      'include/libyuv.h',
+      'include/libyuv/basic_types.h',
+      'include/libyuv/compare.h',
+      'include/libyuv/convert.h',
+      'include/libyuv/convert_argb.h',
+      'include/libyuv/convert_from.h',
+      'include/libyuv/convert_from_argb.h',
+      'include/libyuv/cpu_id.h',
+      'include/libyuv/mjpeg_decoder.h',
+      'include/libyuv/planar_functions.h',
+      'include/libyuv/rotate.h',
+      'include/libyuv/rotate_argb.h',
+      'include/libyuv/rotate_row.h',
+      'include/libyuv/row.h',
+      'include/libyuv/scale.h',
+      'include/libyuv/scale_argb.h',
+      'include/libyuv/scale_row.h',
+      'include/libyuv/version.h',
+      'include/libyuv/video_common.h',
+
+      # sources.
+      'source/compare.cc',
+      'source/compare_common.cc',
+      'source/compare_gcc.cc',
+      'source/compare_neon.cc',
+      'source/compare_neon64.cc',
+      'source/compare_win.cc',
+      'source/convert.cc',
+      'source/convert_argb.cc',
+      'source/convert_from.cc',
+      'source/convert_from_argb.cc',
+      'source/convert_jpeg.cc',
+      'source/convert_to_argb.cc',
+      'source/convert_to_i420.cc',
+      'source/cpu_id.cc',
+      'source/mjpeg_decoder.cc',
+      'source/mjpeg_validate.cc',
+      'source/planar_functions.cc',
+      'source/rotate.cc',
+      'source/rotate_any.cc',
+      'source/rotate_argb.cc',
+      'source/rotate_common.cc',
+      'source/rotate_gcc.cc',
+      'source/rotate_mips.cc',
+      'source/rotate_neon.cc',
+      'source/rotate_neon64.cc',
+      'source/rotate_win.cc',
+      'source/row_any.cc',
+      'source/row_common.cc',
+      'source/row_gcc.cc',
+      'source/row_mips.cc',
+      'source/row_neon.cc',
+      'source/row_neon64.cc',
+      'source/row_win.cc',
+      'source/scale.cc',
+      'source/scale_any.cc',
+      'source/scale_argb.cc',
+      'source/scale_common.cc',
+      'source/scale_gcc.cc',
+      'source/scale_mips.cc',
+      'source/scale_neon.cc',
+      'source/scale_neon64.cc',
+      'source/scale_win.cc',
+      'source/video_common.cc',
+    ],
+  }
+}
diff --git a/files/libyuv_nacl.gyp b/files/libyuv_nacl.gyp
new file mode 100644
index 00000000..b8fe57ee
--- /dev/null
+++ b/files/libyuv_nacl.gyp
@@ -0,0 +1,37 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'includes': [
+    'libyuv.gypi',
+    '../../native_client/build/untrusted.gypi',
+  ],
+  'targets': [
+    {
+      'target_name': 'libyuv_nacl',
+      'type': 'none',
+      'variables': {
+        'nlib_target': 'libyuv_nacl.a',
+        'build_glibc': 0,
+        'build_newlib': 0,
+        'build_pnacl_newlib': 1,
+      },
+      'include_dirs': [
+        'include',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          'include',
+        ],
+      },
+      'sources': [
+        '<@(libyuv_sources)',
+      ],
+    },  # target libyuv_nacl
+  ]
+}
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
index 27cec8f4..27b330f6 100755..100644
--- a/files/libyuv_test.gyp
+++ b/files/libyuv_test.gyp
@@ -3,38 +3,48 @@
 # Use of this source code is governed by a BSD-style license
 # that can be found in the LICENSE file in the root of the source
 # tree. An additional intellectual property rights grant can be found
-# in the file PATENTS.  All contributing project authors may
+# in the file PATENTS. All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 
 {
+  'variables': {
+    'libyuv_disable_jpeg%': 0,
+  },
   'targets': [
     {
       'target_name': 'libyuv_unittest',
-      'type': 'executable',
+      'type': '<(gtest_target_type)',
       'dependencies': [
         'libyuv.gyp:libyuv',
-        # The tests are based on gtest
         'testing/gtest.gyp:gtest',
-        'testing/gtest.gyp:gtest_main',
+        'third_party/gflags/gflags.gyp:gflags',
       ],
-      'defines': [
-        'LIBYUV_SVNREVISION="<!(svnversion -n)"',
-        # 'LIBYUV_USING_SHARED_LIBRARY',
+      'direct_dependent_settings': {
+        'defines': [
+          'GTEST_RELATIVE_PATH',
+        ],
+      },
+      'export_dependent_settings': [
+        '<(DEPTH)/testing/gtest.gyp:gtest',
       ],
       'sources': [
         # headers
         'unit_test/unit_test.h',
 
         # sources
+        'unit_test/basictypes_test.cc',
         'unit_test/compare_test.cc',
+        'unit_test/color_test.cc',
+        'unit_test/convert_test.cc',
         'unit_test/cpu_test.cc',
+        'unit_test/math_test.cc',
         'unit_test/planar_test.cc',
         'unit_test/rotate_argb_test.cc',
         'unit_test/rotate_test.cc',
         'unit_test/scale_argb_test.cc',
         'unit_test/scale_test.cc',
         'unit_test/unit_test.cc',
-        'unit_test/version_test.cc',
+        'unit_test/video_common_test.cc',
       ],
       'conditions': [
         ['OS=="linux"', {
@@ -42,9 +52,55 @@
             '-fexceptions',
           ],
         }],
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+        [ 'OS == "ios"', {
+          'xcode_settings': {
+            'DEBUGGING_SYMBOLS': 'YES',
+            'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+            # Work around compile issue with isosim.mm, see
+            # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
+            'WARNING_CFLAGS': [
+              '-Wno-sometimes-uninitialized',
+            ],
+          },
+          'cflags': [
+            '-Wno-sometimes-uninitialized',
+          ],
+        }],
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
+        ['OS=="android"', {
+          'dependencies': [
+            '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
+          ],
+        }],
+        # TODO(YangZhang): These lines can be removed when high accuracy
+        # YUV to RGB to Neon is ported.
+        [ '(target_arch == "armv7" or target_arch == "armv7s" \
+          or (target_arch == "arm" and arm_version >= 7) \
+          or target_arch == "arm64") \
+          and (arm_neon == 1 or arm_neon_optional == 1)', {
+          'defines': [
+            'LIBYUV_NEON'
+          ],
+        }],
       ], # conditions
+      'defines': [
+        # Enable the following 3 macros to turn off assembly for specified CPU.
+        # 'LIBYUV_DISABLE_X86',
+        # 'LIBYUV_DISABLE_NEON',
+        # 'LIBYUV_DISABLE_MIPS',
+        # Enable the following macro to build libyuv as a shared library (dll).
+        # 'LIBYUV_USING_SHARED_LIBRARY',
+      ],
     },
-
     {
       'target_name': 'compare',
       'type': 'executable',
@@ -63,8 +119,108 @@
         }],
       ], # conditions
     },
+    {
+      'target_name': 'convert',
+      'type': 'executable',
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'sources': [
+        # sources
+        'util/convert.cc',
+      ],
+      'conditions': [
+        ['OS=="linux"', {
+          'cflags': [
+            '-fexceptions',
+          ],
+        }],
+      ], # conditions
+    },
+    # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
+    {
+      'target_name': 'psnr',
+      'type': 'executable',
+      'sources': [
+        # sources
+        'util/psnr_main.cc',
+        'util/psnr.cc',
+        'util/ssim.cc',
+      ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'conditions': [
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
+      ], # conditions
+    },
 
+    {
+      'target_name': 'cpuid',
+      'type': 'executable',
+      'sources': [
+        # sources
+        'util/cpuid.c',
+      ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+    },
   ], # targets
+  'conditions': [
+    ['OS=="android"', {
+      'targets': [
+        {
+          # TODO(kjellander): Figure out what to change in build/apk_test.gypi
+          # to it can be used instead of the copied code below. Using it in its
+          # current version was not possible, since the target starts with 'lib',
+          # which somewhere confuses the variables.
+          'target_name': 'libyuv_unittest_apk',
+          'type': 'none',
+          'variables': {
+            # These are used to configure java_apk.gypi included below.
+            'test_type': 'gtest',
+            'apk_name': 'libyuv_unittest',
+            'test_suite_name': 'libyuv_unittest',
+            'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
+            'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
+            'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
+            'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
+            'test_runner_path': '<(DEPTH)/util/android/test_runner.py',
+            'native_lib_target': 'libyuv_unittest',
+            'gyp_managed_install': 0,
+          },
+          'includes': [
+            'build/android/test_runner.gypi',
+            'build/java_apk.gypi',
+           ],
+          'dependencies': [
+            '<(DEPTH)/base/base.gyp:base_java',
+            # TODO(kjellander): Figure out why base_build_config_gen is needed
+            # here. It really shouldn't since it's a dependency of base_java
+            # above, but there's always 0 tests run if it's missing.
+            '<(DEPTH)/base/base.gyp:base_build_config_gen',
+            '<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
+            '<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
+            '<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
+            '<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
+            '<(DEPTH)/tools/android/android_tools.gyp:android_tools',
+            'libyuv_unittest',
+          ],
+        },
+      ],
+    }],
+  ],
 }
 
 # Local Variables:
diff --git a/files/linux.mk b/files/linux.mk
new file mode 100644
index 00000000..ee5a3a70
--- /dev/null
+++ b/files/linux.mk
@@ -0,0 +1,81 @@
+# This is a generic makefile for libyuv for gcc.
+# make -f linux.mk CXX=clang++
+
+CC?=gcc
+CFLAGS?=-O2 -fomit-frame-pointer
+CFLAGS+=-Iinclude/
+
+CXX?=g++
+CXXFLAGS?=-O2 -fomit-frame-pointer
+CXXFLAGS+=-Iinclude/
+
+LOCAL_OBJ_FILES := \
+	source/compare.o           \
+	source/compare_common.o    \
+	source/compare_gcc.o       \
+	source/compare_neon64.o    \
+	source/compare_neon.o      \
+	source/compare_win.o       \
+	source/convert_argb.o      \
+	source/convert.o           \
+	source/convert_from_argb.o \
+	source/convert_from.o      \
+	source/convert_jpeg.o      \
+	source/convert_to_argb.o   \
+	source/convert_to_i420.o   \
+	source/cpu_id.o            \
+	source/mjpeg_decoder.o     \
+	source/mjpeg_validate.o    \
+	source/planar_functions.o  \
+	source/rotate_any.o        \
+	source/rotate_argb.o       \
+	source/rotate.o            \
+	source/rotate_common.o     \
+	source/rotate_gcc.o        \
+	source/rotate_mips.o       \
+	source/rotate_neon64.o     \
+	source/rotate_neon.o       \
+	source/rotate_win.o        \
+	source/row_any.o           \
+	source/row_common.o        \
+	source/row_gcc.o           \
+	source/row_mips.o          \
+	source/row_neon64.o        \
+	source/row_neon.o          \
+	source/row_win.o           \
+	source/scale_any.o         \
+	source/scale_argb.o        \
+	source/scale.o             \
+	source/scale_common.o      \
+	source/scale_gcc.o         \
+	source/scale_mips.o        \
+	source/scale_neon64.o      \
+	source/scale_neon.o        \
+	source/scale_win.o         \
+	source/video_common.o
+
+.cc.o:
+	$(CXX) -c $(CXXFLAGS) $*.cc -o $*.o
+
+.c.o:
+	$(CC) -c $(CFLAGS) $*.c -o $*.o
+
+all: libyuv.a convert cpuid psnr
+
+libyuv.a: $(LOCAL_OBJ_FILES)
+	$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
+
+# A C++ test utility that uses libyuv conversion.
+convert: util/convert.cc libyuv.a
+	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+
+# A standalone test utility
+psnr: util/psnr.cc
+	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+
+# A C test utility that uses libyuv conversion from C.
+cpuid: util/cpuid.c libyuv.a
+	$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
+
+clean:
+	/bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
diff --git a/files/public.mk b/files/public.mk
new file mode 100644
index 00000000..090d8cb6
--- /dev/null
+++ b/files/public.mk
@@ -0,0 +1,13 @@
+# This file contains all the common make variables which are useful for
+# anyone depending on this library.
+# Note that dependencies on NDK are not directly listed since NDK auto adds
+# them.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/files/setup_links.py b/files/setup_links.py
new file mode 100755
index 00000000..b2b459e6
--- /dev/null
+++ b/files/setup_links.py
@@ -0,0 +1,497 @@
+#!/usr/bin/env python
+# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Setup links to a Chromium checkout for WebRTC.
+
+WebRTC standalone shares a lot of dependencies and build tools with Chromium.
+To do this, many of the paths of a Chromium checkout is emulated by creating
+symlinks to files and directories. This script handles the setup of symlinks to
+achieve this.
+
+It also handles cleanup of the legacy Subversion-based approach that was used
+before Chrome switched over their master repo from Subversion to Git.
+"""
+
+
+import ctypes
+import errno
+import logging
+import optparse
+import os
+import shelve
+import shutil
+import subprocess
+import sys
+import textwrap
+
+
+DIRECTORIES = [
+  'build',
+  'buildtools',
+  'mojo',  # TODO(kjellander): Remove, see webrtc:5629.
+  'native_client',
+  'net',
+  'testing',
+  'third_party/binutils',
+  'third_party/drmemory',
+  'third_party/instrumented_libraries',
+  'third_party/libjpeg',
+  'third_party/libjpeg_turbo',
+  'third_party/llvm-build',
+  'third_party/lss',
+  'third_party/yasm',
+  'third_party/WebKit',  # TODO(kjellander): Remove, see webrtc:5629.
+  'tools/clang',
+  'tools/gn',
+  'tools/gyp',
+  'tools/memory',
+  'tools/python',
+  'tools/swarming_client',
+  'tools/valgrind',
+  'tools/vim',
+  'tools/win',
+]
+
+from sync_chromium import get_target_os_list
+target_os = get_target_os_list()
+if 'android' in target_os:
+  DIRECTORIES += [
+    'base',
+    'third_party/android_platform',
+    'third_party/android_tools',
+    'third_party/appurify-python',
+    'third_party/ashmem',
+    'third_party/catapult',
+    'third_party/icu',
+    'third_party/ijar',
+    'third_party/jsr-305',
+    'third_party/junit',
+    'third_party/libxml',
+    'third_party/mockito',
+    'third_party/modp_b64',
+    'third_party/protobuf',
+    'third_party/requests',
+    'third_party/robolectric',
+    'tools/android',
+    'tools/grit',
+  ]
+if 'ios' in target_os:
+  DIRECTORIES.append('third_party/class-dump')
+
+FILES = {
+  'tools/isolate_driver.py': None,
+  'third_party/BUILD.gn': None,
+}
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+
+def query_yes_no(question, default=False):
+  """Ask a yes/no question via raw_input() and return their answer.
+
+  Modified from http://stackoverflow.com/a/3041990.
+  """
+  prompt = " [%s/%%s]: "
+  prompt = prompt % ('Y' if default is True  else 'y')
+  prompt = prompt % ('N' if default is False else 'n')
+
+  if default is None:
+    default = 'INVALID'
+
+  while True:
+    sys.stdout.write(question + prompt)
+    choice = raw_input().lower()
+    if choice == '' and default != 'INVALID':
+      return default
+
+    if 'yes'.startswith(choice):
+      return True
+    elif 'no'.startswith(choice):
+      return False
+
+    print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
+
+
+# Actions
+class Action(object):
+  def __init__(self, dangerous):
+    self.dangerous = dangerous
+
+  def announce(self, planning):
+    """Log a description of this action.
+
+    Args:
+      planning - True iff we're in the planning stage, False if we're in the
+                 doit stage.
+    """
+    pass
+
+  def doit(self, links_db):
+    """Execute the action, recording what we did to links_db, if necessary."""
+    pass
+
+
+class Remove(Action):
+  def __init__(self, path, dangerous):
+    super(Remove, self).__init__(dangerous)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    log = logging.warn
+    filesystem_type = 'file'
+    if not self.dangerous:
+      log = logging.info
+      filesystem_type = 'link'
+    if planning:
+      log('Planning to remove %s: %s', filesystem_type, self._path)
+    else:
+      log('Removing %s: %s', filesystem_type, self._path)
+
+  def doit(self, _):
+    os.remove(self._path)
+
+
+class Rmtree(Action):
+  def __init__(self, path):
+    super(Rmtree, self).__init__(dangerous=True)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    if planning:
+      logging.warn('Planning to remove directory: %s', self._path)
+    else:
+      logging.warn('Removing directory: %s', self._path)
+
+  def doit(self, _):
+    if sys.platform.startswith('win'):
+      # shutil.rmtree() doesn't work on Windows if any of the directories are
+      # read-only, which svn repositories are.
+      subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
+    else:
+      shutil.rmtree(self._path)
+
+
+class Makedirs(Action):
+  def __init__(self, path):
+    super(Makedirs, self).__init__(dangerous=False)
+    self._priority = 1
+    self._path = path
+
+  def doit(self, _):
+    try:
+      os.makedirs(self._path)
+    except OSError as e:
+      if e.errno != errno.EEXIST:
+        raise
+
+
+class Symlink(Action):
+  def __init__(self, source_path, link_path):
+    super(Symlink, self).__init__(dangerous=False)
+    self._priority = 2
+    self._source_path = source_path
+    self._link_path = link_path
+
+  def announce(self, planning):
+    if planning:
+      logging.info(
+          'Planning to create link from %s to %s', self._link_path,
+          self._source_path)
+    else:
+      logging.debug(
+          'Linking from %s to %s', self._link_path, self._source_path)
+
+  def doit(self, links_db):
+    # Files not in the root directory need relative path calculation.
+    # On Windows, use absolute paths instead since NTFS doesn't seem to support
+    # relative paths for symlinks.
+    if sys.platform.startswith('win'):
+      source_path = os.path.abspath(self._source_path)
+    else:
+      if os.path.dirname(self._link_path) != self._link_path:
+        source_path = os.path.relpath(self._source_path,
+                                      os.path.dirname(self._link_path))
+
+    os.symlink(source_path, os.path.abspath(self._link_path))
+    links_db[self._source_path] = self._link_path
+
+
+class LinkError(IOError):
+  """Failed to create a link."""
+  pass
+
+
+# Handles symlink creation on the different platforms.
+if sys.platform.startswith('win'):
+  def symlink(source_path, link_path):
+    flag = 1 if os.path.isdir(source_path) else 0
+    if not ctypes.windll.kernel32.CreateSymbolicLinkW(
+        unicode(link_path), unicode(source_path), flag):
+      raise OSError('Failed to create symlink to %s. Notice that only NTFS '
+                    'version 5.0 and up has all the needed APIs for '
+                    'creating symlinks.' % source_path)
+  os.symlink = symlink
+
+
+class WebRTCLinkSetup(object):
+  def __init__(self, links_db, force=False, dry_run=False, prompt=False):
+    self._force = force
+    self._dry_run = dry_run
+    self._prompt = prompt
+    self._links_db = links_db
+
+  def CreateLinks(self, on_bot):
+    logging.debug('CreateLinks')
+    # First, make a plan of action
+    actions = []
+
+    for source_path, link_path in FILES.iteritems():
+      actions += self._ActionForPath(
+          source_path, link_path, check_fn=os.path.isfile, check_msg='files')
+    for source_dir in DIRECTORIES:
+      actions += self._ActionForPath(
+          source_dir, None, check_fn=os.path.isdir,
+          check_msg='directories')
+
+    if not on_bot and self._force:
+      # When making the manual switch from legacy SVN checkouts to the new
+      # Git-based Chromium DEPS, the .gclient_entries file that contains cached
+      # URLs for all DEPS entries must be removed to avoid future sync problems.
+      entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
+      if os.path.exists(entries_file):
+        actions.append(Remove(entries_file, dangerous=True))
+
+    actions.sort()
+
+    if self._dry_run:
+      for action in actions:
+        action.announce(planning=True)
+      logging.info('Not doing anything because dry-run was specified.')
+      sys.exit(0)
+
+    if any(a.dangerous for a in actions):
+      logging.warn('Dangerous actions:')
+      for action in (a for a in actions if a.dangerous):
+        action.announce(planning=True)
+      print
+
+      if not self._force:
+        logging.error(textwrap.dedent("""\
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+                              A C T I O N     R E Q I R E D
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        Because chromium/src is transitioning to Git (from SVN), we needed to
+        change the way that the WebRTC standalone checkout works. Instead of
+        individually syncing subdirectories of Chromium in SVN, we're now
+        syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
+        into the `chromium/src` directory.
+
+        As such, all Chromium directories which are currently pulled by DEPS are
+        now replaced with a symlink into the full Chromium checkout.
+
+        To avoid disrupting developers, we've chosen to not delete your
+        directories forcibly, in case you have some work in progress in one of
+        them :).
+
+        ACTION REQUIRED:
+        Before running `gclient sync|runhooks` again, you must run:
+        %s%s --force
+
+        Which will replace all directories which now must be symlinks, after
+        prompting with a summary of the work-to-be-done.
+        """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
+        sys.exit(1)
+      elif self._prompt:
+        if not query_yes_no('Would you like to perform the above plan?'):
+          sys.exit(1)
+
+    for action in actions:
+      action.announce(planning=False)
+      action.doit(self._links_db)
+
+    if not on_bot and self._force:
+      logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
+                   'let the remaining hooks (that probably were interrupted) '
+                   'execute.')
+
+  def CleanupLinks(self):
+    logging.debug('CleanupLinks')
+    for source, link_path  in self._links_db.iteritems():
+      if source == 'SCHEMA_VERSION':
+        continue
+      if os.path.islink(link_path) or sys.platform.startswith('win'):
+        # os.path.islink() always returns false on Windows
+        # See http://bugs.python.org/issue13143.
+        logging.debug('Removing link to %s at %s', source, link_path)
+        if not self._dry_run:
+          if os.path.exists(link_path):
+            if sys.platform.startswith('win') and os.path.isdir(link_path):
+              subprocess.check_call(['rmdir', '/q', '/s', link_path],
+                                    shell=True)
+            else:
+              os.remove(link_path)
+          del self._links_db[source]
+
+  @staticmethod
+  def _ActionForPath(source_path, link_path=None, check_fn=None,
+                     check_msg=None):
+    """Create zero or more Actions to link to a file or directory.
+
+    This will be a symlink on POSIX platforms. On Windows this requires
+    that NTFS is version 5.0 or higher (Vista or newer).
+
+    Args:
+      source_path: Path relative to the Chromium checkout root.
+        For readability, the path may contain slashes, which will
+        automatically be converted to the right path delimiter on Windows.
+      link_path: The location for the link to create. If omitted it will be the
+        same path as source_path.
+      check_fn: A function returning true if the type of filesystem object is
+        correct for the attempted call. Otherwise an error message with
+        check_msg will be printed.
+      check_msg: String used to inform the user of an invalid attempt to create
+        a file.
+    Returns:
+      A list of Action objects.
+    """
+    def fix_separators(path):
+      if sys.platform.startswith('win'):
+        return path.replace(os.altsep, os.sep)
+      else:
+        return path
+
+    assert check_fn
+    assert check_msg
+    link_path = link_path or source_path
+    link_path = fix_separators(link_path)
+
+    source_path = fix_separators(source_path)
+    source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
+    if os.path.exists(source_path) and not check_fn:
+      raise LinkError('_LinkChromiumPath can only be used to link to %s: '
+                      'Tried to link to: %s' % (check_msg, source_path))
+
+    if not os.path.exists(source_path):
+      logging.debug('Silently ignoring missing source: %s. This is to avoid '
+                    'errors on platform-specific dependencies.', source_path)
+      return []
+
+    actions = []
+
+    if os.path.exists(link_path) or os.path.islink(link_path):
+      if os.path.islink(link_path):
+        actions.append(Remove(link_path, dangerous=False))
+      elif os.path.isfile(link_path):
+        actions.append(Remove(link_path, dangerous=True))
+      elif os.path.isdir(link_path):
+        actions.append(Rmtree(link_path))
+      else:
+        raise LinkError('Don\'t know how to plan: %s' % link_path)
+
+    # Create parent directories to the target link if needed.
+    target_parent_dirs = os.path.dirname(link_path)
+    if (target_parent_dirs and
+        target_parent_dirs != link_path and
+        not os.path.exists(target_parent_dirs)):
+      actions.append(Makedirs(target_parent_dirs))
+
+    actions.append(Symlink(source_path, link_path))
+
+    return actions
+
+def _initialize_database(filename):
+  links_database = shelve.open(filename)
+
+  # Wipe the database if this version of the script ends up looking at a
+  # newer (future) version of the links db, just to be sure.
+  version = links_database.get('SCHEMA_VERSION')
+  if version and version != SCHEMA_VERSION:
+    logging.info('Found database with schema version %s while this script only '
+                 'supports %s. Wiping previous database contents.', version,
+                 SCHEMA_VERSION)
+    links_database.clear()
+  links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+  return links_database
+
+
+def main():
+  on_bot = os.environ.get('CHROME_HEADLESS') == '1'
+
+  parser = optparse.OptionParser()
+  parser.add_option('-d', '--dry-run', action='store_true', default=False,
+                    help='Print what would be done, but don\'t perform any '
+                         'operations. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-c', '--clean-only', action='store_true', default=False,
+                    help='Only clean previously created links, don\'t create '
+                         'new ones. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-f', '--force', action='store_true', default=on_bot,
+                    help='Force link creation. CAUTION: This deletes existing '
+                         'folders and files in the locations where links are '
+                         'about to be created.')
+  parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
+                    default=(not on_bot),
+                    help='Prompt if we\'re planning to do a dangerous action')
+  parser.add_option('-v', '--verbose', action='store_const',
+                    const=logging.DEBUG, default=logging.INFO,
+                    help='Print verbose output for debugging.')
+  options, _ = parser.parse_args()
+
+  if options.dry_run or options.force or options.clean_only:
+    options.verbose = logging.DEBUG
+  logging.basicConfig(format='%(message)s', level=options.verbose)
+
+  # Work from the root directory of the checkout.
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  os.chdir(script_dir)
+
+  if sys.platform.startswith('win'):
+    def is_admin():
+      try:
+        return os.getuid() == 0
+      except AttributeError:
+        return ctypes.windll.shell32.IsUserAnAdmin() != 0
+    if not is_admin():
+      logging.error('On Windows, you now need to have administrator '
+                    'privileges for the shell running %s (or '
+                    '`gclient sync|runhooks`).\nPlease start another command '
+                    'prompt as Administrator and try again.', sys.argv[0])
+      return 1
+
+  if not os.path.exists(CHROMIUM_CHECKOUT):
+    logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
+                  'sync" before running this script?', CHROMIUM_CHECKOUT)
+    return 2
+
+  links_database = _initialize_database(LINKS_DB)
+  try:
+    symlink_creator = WebRTCLinkSetup(links_database, options.force,
+                                      options.dry_run, options.prompt)
+    symlink_creator.CleanupLinks()
+    if not options.clean_only:
+      symlink_creator.CreateLinks(on_bot)
+  except LinkError as e:
+    print >> sys.stderr, e.message
+    return 3
+  finally:
+    links_database.close()
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/files/source/compare.cc b/files/source/compare.cc
index bf4a7dae..e3846bdf 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -17,8 +17,10 @@
 #endif
 
 #include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
+#include "libyuv/video_common.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -26,363 +28,133 @@ extern "C" {
 #endif
 
 // hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
-  for (int i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-// This module is for Visual C x86
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static const uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
-// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
-// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
-// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
-// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
-    _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-
-    pxor       xmm7, xmm7        // constant 0 for unpck
-    movdqa     xmm6, kHash16x33
-
-    align      16
-  wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
-    lea        eax, [eax + 16]
-    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
-    movdqa     xmm5, kHashMul0
-    movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
-    movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
-    pmulld(0xdd)                 // pmulld     xmm3, xmm5
-    movdqa     xmm5, kHashMul1
-    movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
-    pmulld(0xe5)                 // pmulld     xmm4, xmm5
-    movdqa     xmm5, kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
-    movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
-    pmulld(0xd5)                 // pmulld     xmm2, xmm5
-    movdqa     xmm5, kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
-    pmulld(0xcd)                 // pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
-    paddd      xmm1, xmm2
-    sub        ecx, 16
-    paddd      xmm1, xmm3
-
-    pshufd     xmm2, xmm1, 14    // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 1
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
-    jg         wloop
-
-    movd       eax, xmm0        // return hash
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-#define HAS_HASHDJB2_SSE41
-CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-CONST uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
-};
-CONST uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
-};
-CONST uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
-};
-CONST uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
-};
-static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  uint32 hash;
-  asm volatile (
-    "movd      %2,%%xmm0                       \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    "movdqa    %4,%%xmm6                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm1                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "pmulld    %%xmm6,%%xmm0                   \n"
-    "movdqa    %5,%%xmm5                       \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm7,%%xmm3                   \n"
-    "pmulld    %%xmm5,%%xmm3                   \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpckhwd %%xmm7,%%xmm4                   \n"
-    "pmulld    %%xmm5,%%xmm4                   \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "punpckhbw %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm7,%%xmm2                   \n"
-    "pmulld    %%xmm5,%%xmm2                   \n"
-    "movdqa    %8,%%xmm5                       \n"
-    "punpckhwd %%xmm7,%%xmm1                   \n"
-    "pmulld    %%xmm5,%%xmm1                   \n"
-    "paddd     %%xmm4,%%xmm3                   \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "sub       $0x10,%1                        \n"
-    "paddd     %%xmm3,%%xmm1                   \n"
-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "jg        1b                              \n"
-    "movd      %%xmm0,%3                       \n"
-  : "+r"(src),        // %0
-    "+r"(count),      // %1
-    "+rm"(seed),      // %2
-    "=g"(hash)        // %3
-  : "m"(kHash16x33),  // %4
-    "m"(kHashMul0),   // %5
-    "m"(kHashMul1),   // %6
-    "m"(kHashMul2),   // %7
-    "m"(kHashMul3)    // %8
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-  return hash;
-}
-#endif  // HAS_HASHDJB2_SSE41
-
-// hash seed of 5381 recommended.
 LIBYUV_API
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+  const int kBlockSize = 1 << 15;  // 32768;
+  int remainder;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
+      HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
   if (TestCpuFlag(kCpuHasSSE41)) {
     HashDjb2_SSE = HashDjb2_SSE41;
   }
 #endif
+#if defined(HAS_HASHDJB2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HashDjb2_SSE = HashDjb2_AVX2;
+  }
+#endif
 
-  const int kBlockSize = 1 << 15;  // 32768;
-  while (count >= static_cast<uint64>(kBlockSize)) {
+  while (count >= (uint64)(kBlockSize)) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
     src += kBlockSize;
     count -= kBlockSize;
   }
-  int remainder = static_cast<int>(count) & ~15;
+  remainder = (int)(count) & ~15;
   if (remainder) {
     seed = HashDjb2_SSE(src, remainder, seed);
     src += remainder;
     count -= remainder;
   }
-  remainder = static_cast<int>(count) & 15;
+  remainder = (int)(count) & 15;
   if (remainder) {
     seed = HashDjb2_C(src, remainder, seed);
   }
   return seed;
 }
 
-#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SUMSQUAREERROR_NEON
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-
-#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked) __declspec(align(16))
-static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
-                                  int count) {
-  __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
-    pxor       xmm0, xmm0
-    pxor       xmm5, xmm5
-    sub        edx, eax
-
-    align      16
-  wloop:
-    movdqa     xmm1, [eax]
-    movdqa     xmm2, [eax + edx]
-    lea        eax,  [eax + 16]
-    sub        ecx, 16
-    movdqa     xmm3, xmm1  // abs trick
-    psubusb    xmm1, xmm2
-    psubusb    xmm2, xmm3
-    por        xmm1, xmm2
-    movdqa     xmm2, xmm1
-    punpcklbw  xmm1, xmm5
-    punpckhbw  xmm2, xmm5
-    pmaddwd    xmm1, xmm1
-    pmaddwd    xmm2, xmm2
-    paddd      xmm0, xmm1
-    paddd      xmm0, xmm2
-    jg         wloop
-
-    pshufd     xmm1, xmm0, 0EEh
-    paddd      xmm0, xmm1
-    pshufd     xmm1, xmm0, 01h
-    paddd      xmm0, xmm1
-    movd       eax, xmm0
-    ret
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
+      return FOURCC_BGRA;
+    }
+    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
+      return FOURCC_ARGB;
+    }
+    argb += 8;
   }
+  if (width & 1) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;
 }
 
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_SUMSQUAREERROR_SSE2
-static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
-                                  int count) {
-  uint32 sse;
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-    "1:                                        \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "movdqa    (%0,%1,1),%%xmm2                \n"
-    "lea       0x10(%0),%0                     \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psubusb   %%xmm2,%%xmm1                   \n"
-    "psubusb   %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpckhbw %%xmm5,%%xmm2                   \n"
-    "pmaddwd   %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm2,%%xmm2                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "jg        1b                              \n"
-
-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0,%3                       \n"
-
-  : "+r"(src_a),      // %0
-    "+r"(src_b),      // %1
-    "+r"(count),      // %2
-    "=g"(sse)         // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
-  );
-  return sse;
-}
-#endif
-
-static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
-                               int count) {
-  uint32 sse = 0u;
-  for (int i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += static_cast<uint32>(diff * diff);
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+  uint32 fourcc = 0;
+  int h;
+
+  // Coalesce rows.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
   }
-  return sse;
+  for (h = 0; h < height && fourcc == 0; ++h) {
+    fourcc = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+  }
+  return fourcc;
 }
 
+// TODO(fbarchard): Refactor into row function.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
                              int count) {
+  // SumSquareError returns values 0 to 65535 for each squared difference.
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;
+  uint64 sse = 0;
+  int i;
   uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
       SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
   }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
-  // 32K values will fit a 32bit int return value from SumSquareError.
-  // After each block of 32K, accumulate into 64 bit int.
-  const int kBlockSize = 1 << 15;  // 32768;
-  uint64 sse = 0;
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
 #ifdef _OPENMP
 #pragma omp parallel for reduction(+: sse)
 #endif
-  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
   }
   src_a += count & ~(kBlockSize - 1);
   src_b += count & ~(kBlockSize - 1);
-  int remainder = count & (kBlockSize - 1) & ~15;
   if (remainder) {
     sse += SumSquareError(src_a, src_b, remainder);
     src_a += remainder;
     src_b += remainder;
   }
-  remainder = count & 15;
+  remainder = count & 31;
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
@@ -393,27 +165,20 @@ LIBYUV_API
 uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
                                   const uint8* src_b, int stride_b,
                                   int width, int height) {
-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
-      SumSquareError_C;
-#if defined(HAS_SUMSQUAREERROR_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SumSquareError = SumSquareError_NEON;
-  }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
-      IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
-    SumSquareError = SumSquareError_SSE2;
-  }
-#endif
-
   uint64 sse = 0;
-  for (int h = 0; h < height; ++h) {
-    sse += SumSquareError(src_a, src_b, width);
+  int h;
+  // Coalesce rows.
+  if (stride_a == width &&
+      stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (h = 0; h < height; ++h) {
+    sse += ComputeSumSquareError(src_a, src_b, width);
     src_a += stride_a;
     src_b += stride_b;
   }
-
   return sse;
 }
 
@@ -421,7 +186,7 @@ LIBYUV_API
 double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
   double psnr;
   if (sse > 0) {
-    double mse = static_cast<double>(count) / static_cast<double>(sse);
+    double mse = (double)(count) / (double)(sse);
     psnr = 10.0 * log10(255.0 * 255.0 * mse);
   } else {
     psnr = kMaxPsnr;      // Limit to prevent divide by 0
@@ -479,8 +244,10 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
   int64 sum_sq_b = 0;
   int64 sum_axb = 0;
 
-  for (int i = 0; i < 8; ++i) {
-    for (int j = 0; j < 8; ++j) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    int j;
+    for (j = 0; j < 8; ++j) {
       sum_a += src_a[j];
       sum_b += src_b[j];
       sum_sq_a += src_a[j] * src_a[j];
@@ -492,26 +259,29 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
     src_b += stride_b;
   }
 
-  const int64 count = 64;
-  // scale the constants by number of pixels
-  const int64 c1 = (cc1 * count * count) >> 12;
-  const int64 c2 = (cc2 * count * count) >> 12;
+  {
+    const int64 count = 64;
+    // scale the constants by number of pixels
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
 
-  const int64 sum_a_x_sum_b = sum_a * sum_b;
+    const int64 sum_a_x_sum_b = sum_a * sum_b;
 
-  const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
-                       (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
 
-  const int64 sum_a_sq = sum_a*sum_a;
-  const int64 sum_b_sq = sum_b*sum_b;
+    const int64 sum_a_sq = sum_a*sum_a;
+    const int64 sum_b_sq = sum_b*sum_b;
 
-  const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                       (count * sum_sq_a - sum_a_sq +
-                        count * sum_sq_b - sum_b_sq + c2);
+    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+                         (count * sum_sq_a - sum_a_sq +
+                          count * sum_sq_b - sum_b_sq + c2);
 
-  if (ssim_d == 0.0)
-    return DBL_MAX;
-  return ssim_n * 1.0 / ssim_d;
+    if (ssim_d == 0.0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
 }
 
 // We are using a 8x8 moving window with starting location of each 8x8 window
@@ -523,15 +293,14 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
                      int width, int height) {
   int samples = 0;
   double ssim_total = 0;
-
   double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b);
-
-  Ssim8x8 = Ssim8x8_C;
+                    const uint8* src_b, int stride_b) = Ssim8x8_C;
 
   // sample point start with each 4x4 location
-  for (int i = 0; i < height - 8; i += 4) {
-    for (int j = 0; j < width - 8; j += 4) {
+  int i;
+  for (i = 0; i < height - 8; i += 4) {
+    int j;
+    for (j = 0; j < width - 8; j += 4) {
       ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
       samples++;
     }
diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc
new file mode 100644
index 00000000..42fc5893
--- /dev/null
+++ b/files/source/compare_common.cc
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += (uint32)(diff * diff);
+  }
+  return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  uint32 hash = seed;
+  int i;
+  for (i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
+  }
+  return hash;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
new file mode 100644
index 00000000..1b83edb1
--- /dev/null
+++ b/files/source/compare_gcc.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse;
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10, 1) ",%1          \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psubusb   %%xmm2,%%xmm1                   \n"
+    "psubusb   %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+  return sse;
+}
+
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  uint32 hash;
+  asm volatile (
+    "movd      %2,%%xmm0                       \n"
+    "pxor      %%xmm7,%%xmm7                   \n"
+    "movdqa    %4,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "pmulld    %%xmm6,%%xmm0                   \n"
+    "movdqa    %5,%%xmm5                       \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm7,%%xmm3                   \n"
+    "pmulld    %%xmm5,%%xmm3                   \n"
+    "movdqa    %6,%%xmm5                       \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpckhwd %%xmm7,%%xmm4                   \n"
+    "pmulld    %%xmm5,%%xmm4                   \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "punpckhbw %%xmm7,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm7,%%xmm2                   \n"
+    "pmulld    %%xmm5,%%xmm2                   \n"
+    "movdqa    %8,%%xmm5                       \n"
+    "punpckhwd %%xmm7,%%xmm1                   \n"
+    "pmulld    %%xmm5,%%xmm1                   \n"
+    "paddd     %%xmm4,%%xmm3                   \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm1                   \n"
+    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%1                        \n"
+    "jg        1b                              \n"
+    "movd      %%xmm0,%3                       \n"
+  : "+r"(src),        // %0
+    "+r"(count),      // %1
+    "+rm"(seed),      // %2
+    "=g"(hash)        // %3
+  : "m"(kHash16x33),  // %4
+    "m"(kHashMul0),   // %5
+    "m"(kHashMul1),   // %6
+    "m"(kHashMul2),   // %7
+    "m"(kHashMul3)    // %8
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+  return hash;
+}
+#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
index d8b375b8..49aa3b4e 100644
--- a/files/source/compare_neon.cc
+++ b/files/source/compare_neon.cc
@@ -4,18 +4,22 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/basic_types.h"
 
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   volatile uint32 sse;
@@ -25,10 +29,11 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
     "vmov.u8    q9, #0                         \n"
     "vmov.u8    q11, #0                        \n"
 
-    ".p2align  2                               \n"
   "1:                                          \n"
-    "vld1.u8    {q0}, [%0]!                    \n"
-    "vld1.u8    {q1}, [%1]!                    \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"
     "subs       %2, %2, #16                    \n"
     "vsubl.u8   q2, d0, d2                     \n"
     "vsubl.u8   q3, d1, d3                     \n"
@@ -53,10 +58,9 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   return sse;
 }
 
-#endif  // __ARM_NEON__
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
new file mode 100644
index 00000000..f9c7df98
--- /dev/null
+++ b/files/source/compare_neon64.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
new file mode 100644
index 00000000..dc86fe25
--- /dev/null
+++ b/files/source/compare_win.cc
@@ -0,0 +1,222 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0
+    pxor       xmm5, xmm5
+
+  wloop:
+    movdqu     xmm1, [eax]
+    lea        eax,  [eax + 16]
+    movdqu     xmm2, [edx]
+    lea        edx,  [edx + 16]
+    movdqa     xmm3, xmm1  // abs trick
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm5
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16
+    jg         wloop
+
+    pshufd     xmm1, xmm0, 0xee
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 0x01
+    paddd      xmm0, xmm1
+    movd       eax, xmm0
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked)
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
+    sub        edx, eax
+
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    sub        ecx, 32
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm0, ymm0, ymm1
+    vmovd      eax, xmm0
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+__declspec(naked)
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+
+    pxor       xmm7, xmm7        // constant 0 for unpck
+    movdqa     xmm6, xmmword ptr kHash16x33
+
+  wloop:
+    movdqu     xmm1, [eax]       // src[0-15]
+    lea        eax, [eax + 16]
+    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    movdqa     xmm5, xmmword ptr kHashMul0
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm2, xmm7        // src[0-7]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm3, xmm7        // src[0-3]
+    pmulld     xmm3, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul1
+    movdqa     xmm4, xmm2
+    punpckhwd  xmm4, xmm7        // src[4-7]
+    pmulld     xmm4, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul2
+    punpckhbw  xmm1, xmm7        // src[8-15]
+    movdqa     xmm2, xmm1
+    punpcklwd  xmm2, xmm7        // src[8-11]
+    pmulld     xmm2, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul3
+    punpckhwd  xmm1, xmm7        // src[12-15]
+    pmulld     xmm1, xmm5
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    vmovd      xmm0, [esp + 12]  // seed
+
+  wloop:
+    vpmovzxbd  xmm3, [eax]  // src[0-3]
+    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
+    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
+    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
+    lea        eax, [eax + 16]
+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
+    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm1, xmm1, xmm3
+    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
+    vpaddd     xmm1, xmm1,xmm2
+    vpshufd    xmm2, xmm1, 0x01
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm0, xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    vmovd      eax, xmm0         // return hash
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h
deleted file mode 100644
index ef3ebf36..00000000
--- a/files/source/conversion_tables.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-/**************************************************************
-*  conversion_tables.h
-*
-*    Pre-compiled definitions of the conversion equations: YUV -> RGB.
-*
-***************************************************************/
-
-#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
-#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-/******************************************************************************
-* YUV TO RGB approximation
-*
-*  R = clip( (298 * (Y - 16)                   + 409 * (V - 128) + 128 ) >> 8 )
-*  G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 )
-*  B = clip( (298 * (Y - 16) + 516 * (U - 128)                   + 128 ) >> 8 )
-*******************************************************************************/
-
-    #define Yc(i)  static_cast<int> ( 298  * ( i - 16 )) // Y contribution
-    #define Ucg(i) static_cast<int> ( -100 * ( i - 128 ))// U contribution to G
-    #define Ucb(i) static_cast<int> ( 516  * ( i - 128 ))// U contribution to B
-    #define Vcr(i) static_cast<int> ( 409  * ( i - 128 ))// V contribution to R
-    #define Vcg(i) static_cast<int> ( -208 * ( i - 128 ))// V contribution to G
-
-    static const int mapYc[256] = {
-        Yc(0),Yc(1),Yc(2),Yc(3),Yc(4),Yc(5),Yc(6),Yc(7),Yc(8),Yc(9),
-        Yc(10),Yc(11),Yc(12),Yc(13),Yc(14),Yc(15),Yc(16),Yc(17),Yc(18),Yc(19),
-        Yc(20),Yc(21),Yc(22),Yc(23),Yc(24),Yc(25),Yc(26),Yc(27),Yc(28),Yc(29),
-        Yc(30),Yc(31),Yc(32),Yc(33),Yc(34),Yc(35),Yc(36),Yc(37),Yc(38),Yc(39),
-        Yc(40),Yc(41),Yc(42),Yc(43),Yc(44),Yc(45),Yc(46),Yc(47),Yc(48),Yc(49),
-        Yc(50),Yc(51),Yc(52),Yc(53),Yc(54),Yc(55),Yc(56),Yc(57),Yc(58),Yc(59),
-        Yc(60),Yc(61),Yc(62),Yc(63),Yc(64),Yc(65),Yc(66),Yc(67),Yc(68),Yc(69),
-        Yc(70),Yc(71),Yc(72),Yc(73),Yc(74),Yc(75),Yc(76),Yc(77),Yc(78),Yc(79),
-        Yc(80),Yc(81),Yc(82),Yc(83),Yc(84),Yc(85),Yc(86),Yc(87),Yc(88),Yc(89),
-        Yc(90),Yc(91),Yc(92),Yc(93),Yc(94),Yc(95),Yc(96),Yc(97),Yc(98),Yc(99),
-        Yc(100),Yc(101),Yc(102),Yc(103),Yc(104),Yc(105),Yc(106),Yc(107),Yc(108),
-        Yc(109),Yc(110),Yc(111),Yc(112),Yc(113),Yc(114),Yc(115),Yc(116),Yc(117),
-        Yc(118),Yc(119),Yc(120),Yc(121),Yc(122),Yc(123),Yc(124),Yc(125),Yc(126),
-        Yc(127),Yc(128),Yc(129),Yc(130),Yc(131),Yc(132),Yc(133),Yc(134),Yc(135),
-        Yc(136),Yc(137),Yc(138),Yc(139),Yc(140),Yc(141),Yc(142),Yc(143),Yc(144),
-        Yc(145),Yc(146),Yc(147),Yc(148),Yc(149),Yc(150),Yc(151),Yc(152),Yc(153),
-        Yc(154),Yc(155),Yc(156),Yc(157),Yc(158),Yc(159),Yc(160),Yc(161),Yc(162),
-        Yc(163),Yc(164),Yc(165),Yc(166),Yc(167),Yc(168),Yc(169),Yc(170),Yc(171),
-        Yc(172),Yc(173),Yc(174),Yc(175),Yc(176),Yc(177),Yc(178),Yc(179),Yc(180),
-        Yc(181),Yc(182),Yc(183),Yc(184),Yc(185),Yc(186),Yc(187),Yc(188),Yc(189),
-        Yc(190),Yc(191),Yc(192),Yc(193),Yc(194),Yc(195),Yc(196),Yc(197),Yc(198),
-        Yc(199),Yc(200),Yc(201),Yc(202),Yc(203),Yc(204),Yc(205),Yc(206),Yc(207),
-        Yc(208),Yc(209),Yc(210),Yc(211),Yc(212),Yc(213),Yc(214),Yc(215),Yc(216),
-        Yc(217),Yc(218),Yc(219),Yc(220),Yc(221),Yc(222),Yc(223),Yc(224),Yc(225),
-        Yc(226),Yc(227),Yc(228),Yc(229),Yc(230),Yc(231),Yc(232),Yc(233),Yc(234),
-        Yc(235),Yc(236),Yc(237),Yc(238),Yc(239),Yc(240),Yc(241),Yc(242),Yc(243),
-        Yc(244),Yc(245),Yc(246),Yc(247),Yc(248),Yc(249),Yc(250),Yc(251),Yc(252),
-        Yc(253),Yc(254),Yc(255)};
-
-   static const int mapUcg[256] = {
-        Ucg(0),Ucg(1),Ucg(2),Ucg(3),Ucg(4),Ucg(5),Ucg(6),Ucg(7),Ucg(8),Ucg(9),
-        Ucg(10),Ucg(11),Ucg(12),Ucg(13),Ucg(14),Ucg(15),Ucg(16),Ucg(17),Ucg(18),
-        Ucg(19),Ucg(20),Ucg(21),Ucg(22),Ucg(23),Ucg(24),Ucg(25),Ucg(26),Ucg(27),
-        Ucg(28),Ucg(29),Ucg(30),Ucg(31),Ucg(32),Ucg(33),Ucg(34),Ucg(35),Ucg(36),
-        Ucg(37),Ucg(38),Ucg(39),Ucg(40),Ucg(41),Ucg(42),Ucg(43),Ucg(44),Ucg(45),
-        Ucg(46),Ucg(47),Ucg(48),Ucg(49),Ucg(50),Ucg(51),Ucg(52),Ucg(53),Ucg(54),
-        Ucg(55),Ucg(56),Ucg(57),Ucg(58),Ucg(59),Ucg(60),Ucg(61),Ucg(62),Ucg(63),
-        Ucg(64),Ucg(65),Ucg(66),Ucg(67),Ucg(68),Ucg(69),Ucg(70),Ucg(71),Ucg(72),
-        Ucg(73),Ucg(74),Ucg(75),Ucg(76),Ucg(77),Ucg(78),Ucg(79),Ucg(80),Ucg(81),
-        Ucg(82),Ucg(83),Ucg(84),Ucg(85),Ucg(86),Ucg(87),Ucg(88),Ucg(89),Ucg(90),
-        Ucg(91),Ucg(92),Ucg(93),Ucg(94),Ucg(95),Ucg(96),Ucg(97),Ucg(98),Ucg(99),
-        Ucg(100),Ucg(101),Ucg(102),Ucg(103),Ucg(104),Ucg(105),Ucg(106),Ucg(107),
-        Ucg(108),Ucg(109),Ucg(110),Ucg(111),Ucg(112),Ucg(113),Ucg(114),Ucg(115),
-        Ucg(116),Ucg(117),Ucg(118),Ucg(119),Ucg(120),Ucg(121),Ucg(122),Ucg(123),
-        Ucg(124),Ucg(125),Ucg(126),Ucg(127),Ucg(128),Ucg(129),Ucg(130),Ucg(131),
-        Ucg(132),Ucg(133),Ucg(134),Ucg(135),Ucg(136),Ucg(137),Ucg(138),Ucg(139),
-        Ucg(140),Ucg(141),Ucg(142),Ucg(143),Ucg(144),Ucg(145),Ucg(146),Ucg(147),
-        Ucg(148),Ucg(149),Ucg(150),Ucg(151),Ucg(152),Ucg(153),Ucg(154),Ucg(155),
-        Ucg(156),Ucg(157),Ucg(158),Ucg(159),Ucg(160),Ucg(161),Ucg(162),Ucg(163),
-        Ucg(164),Ucg(165),Ucg(166),Ucg(167),Ucg(168),Ucg(169),Ucg(170),Ucg(171),
-        Ucg(172),Ucg(173),Ucg(174),Ucg(175),Ucg(176),Ucg(177),Ucg(178),Ucg(179),
-        Ucg(180),Ucg(181),Ucg(182),Ucg(183),Ucg(184),Ucg(185),Ucg(186),Ucg(187),
-        Ucg(188),Ucg(189),Ucg(190),Ucg(191),Ucg(192),Ucg(193),Ucg(194),Ucg(195),
-        Ucg(196),Ucg(197),Ucg(198),Ucg(199),Ucg(200),Ucg(201),Ucg(202),Ucg(203),
-        Ucg(204),Ucg(205),Ucg(206),Ucg(207),Ucg(208),Ucg(209),Ucg(210),Ucg(211),
-        Ucg(212),Ucg(213),Ucg(214),Ucg(215),Ucg(216),Ucg(217),Ucg(218),Ucg(219),
-        Ucg(220),Ucg(221),Ucg(222),Ucg(223),Ucg(224),Ucg(225),Ucg(226),Ucg(227),
-        Ucg(228),Ucg(229),Ucg(230),Ucg(231),Ucg(232),Ucg(233),Ucg(234),Ucg(235),
-        Ucg(236),Ucg(237),Ucg(238),Ucg(239),Ucg(240),Ucg(241),Ucg(242),Ucg(243),
-        Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251),
-        Ucg(252),Ucg(253),Ucg(254),Ucg(255)};
-
-   static const int mapUcb[256] = {
-        Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9),
-        Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18),
-        Ucb(19),Ucb(20),Ucb(21),Ucb(22),Ucb(23),Ucb(24),Ucb(25),Ucb(26),Ucb(27),
-        Ucb(28),Ucb(29),Ucb(30),Ucb(31),Ucb(32),Ucb(33),Ucb(34),Ucb(35),Ucb(36),
-        Ucb(37),Ucb(38),Ucb(39),Ucb(40),Ucb(41),Ucb(42),Ucb(43),Ucb(44),Ucb(45),
-        Ucb(46),Ucb(47),Ucb(48),Ucb(49),Ucb(50),Ucb(51),Ucb(52),Ucb(53),Ucb(54),
-        Ucb(55),Ucb(56),Ucb(57),Ucb(58),Ucb(59),Ucb(60),Ucb(61),Ucb(62),Ucb(63),
-        Ucb(64),Ucb(65),Ucb(66),Ucb(67),Ucb(68),Ucb(69),Ucb(70),Ucb(71),Ucb(72),
-        Ucb(73),Ucb(74),Ucb(75),Ucb(76),Ucb(77),Ucb(78),Ucb(79),Ucb(80),Ucb(81),
-        Ucb(82),Ucb(83),Ucb(84),Ucb(85),Ucb(86),Ucb(87),Ucb(88),Ucb(89),Ucb(90),
-        Ucb(91),Ucb(92),Ucb(93),Ucb(94),Ucb(95),Ucb(96),Ucb(97),Ucb(98),Ucb(99),
-        Ucb(100),Ucb(101),Ucb(102),Ucb(103),Ucb(104),Ucb(105),Ucb(106),Ucb(107),
-        Ucb(108),Ucb(109),Ucb(110),Ucb(111),Ucb(112),Ucb(113),Ucb(114),Ucb(115),
-        Ucb(116),Ucb(117),Ucb(118),Ucb(119),Ucb(120),Ucb(121),Ucb(122),Ucb(123),
-        Ucb(124),Ucb(125),Ucb(126),Ucb(127),Ucb(128),Ucb(129),Ucb(130),Ucb(131),
-        Ucb(132),Ucb(133),Ucb(134),Ucb(135),Ucb(136),Ucb(137),Ucb(138),Ucb(139),
-        Ucb(140),Ucb(141),Ucb(142),Ucb(143),Ucb(144),Ucb(145),Ucb(146),Ucb(147),
-        Ucb(148),Ucb(149),Ucb(150),Ucb(151),Ucb(152),Ucb(153),Ucb(154),Ucb(155),
-        Ucb(156),Ucb(157),Ucb(158),Ucb(159),Ucb(160),Ucb(161),Ucb(162),Ucb(163),
-        Ucb(164),Ucb(165),Ucb(166),Ucb(167),Ucb(168),Ucb(169),Ucb(170),Ucb(171),
-        Ucb(172),Ucb(173),Ucb(174),Ucb(175),Ucb(176),Ucb(177),Ucb(178),Ucb(179),
-        Ucb(180),Ucb(181),Ucb(182),Ucb(183),Ucb(184),Ucb(185),Ucb(186),Ucb(187),
-        Ucb(188),Ucb(189),Ucb(190),Ucb(191),Ucb(192),Ucb(193),Ucb(194),Ucb(195),
-        Ucb(196),Ucb(197),Ucb(198),Ucb(199),Ucb(200),Ucb(201),Ucb(202),Ucb(203),
-        Ucb(204),Ucb(205),Ucb(206),Ucb(207),Ucb(208),Ucb(209),Ucb(210),Ucb(211),
-        Ucb(212),Ucb(213),Ucb(214),Ucb(215),Ucb(216),Ucb(217),Ucb(218),Ucb(219),
-        Ucb(220),Ucb(221),Ucb(222),Ucb(223),Ucb(224),Ucb(225),Ucb(226),Ucb(227),
-        Ucb(228),Ucb(229),Ucb(230),Ucb(231),Ucb(232),Ucb(233),Ucb(234),Ucb(235),
-        Ucb(236),Ucb(237),Ucb(238),Ucb(239),Ucb(240),Ucb(241),Ucb(242),Ucb(243),
-        Ucb(244),Ucb(245),Ucb(246),Ucb(247),Ucb(248),Ucb(249),Ucb(250),Ucb(251),
-        Ucb(252),Ucb(253),Ucb(254),Ucb(255)};
-
-    static const int mapVcr[256] = {
-        Vcr(0),Vcr(1),Vcr(2),Vcr(3),Vcr(4),Vcr(5),Vcr(6),Vcr(7),Vcr(8),Vcr(9),
-        Vcr(10),Vcr(11),Vcr(12),Vcr(13),Vcr(14),Vcr(15),Vcr(16),Vcr(17),Vcr(18),
-        Vcr(19),Vcr(20),Vcr(21),Vcr(22),Vcr(23),Vcr(24),Vcr(25),Vcr(26),Vcr(27),
-        Vcr(28),Vcr(29),Vcr(30),Vcr(31),Vcr(32),Vcr(33),Vcr(34),Vcr(35),Vcr(36),
-        Vcr(37),Vcr(38),Vcr(39),Vcr(40),Vcr(41),Vcr(42),Vcr(43),Vcr(44),Vcr(45),
-        Vcr(46),Vcr(47),Vcr(48),Vcr(49),Vcr(50),Vcr(51),Vcr(52),Vcr(53),Vcr(54),
-        Vcr(55),Vcr(56),Vcr(57),Vcr(58),Vcr(59),Vcr(60),Vcr(61),Vcr(62),Vcr(63),
-        Vcr(64),Vcr(65),Vcr(66),Vcr(67),Vcr(68),Vcr(69),Vcr(70),Vcr(71),Vcr(72),
-        Vcr(73),Vcr(74),Vcr(75),Vcr(76),Vcr(77),Vcr(78),Vcr(79),Vcr(80),Vcr(81),
-        Vcr(82),Vcr(83),Vcr(84),Vcr(85),Vcr(86),Vcr(87),Vcr(88),Vcr(89),Vcr(90),
-        Vcr(91),Vcr(92),Vcr(93),Vcr(94),Vcr(95),Vcr(96),Vcr(97),Vcr(98),Vcr(99),
-        Vcr(100),Vcr(101),Vcr(102),Vcr(103),Vcr(104),Vcr(105),Vcr(106),Vcr(107),
-        Vcr(108),Vcr(109),Vcr(110),Vcr(111),Vcr(112),Vcr(113),Vcr(114),Vcr(115),
-        Vcr(116),Vcr(117),Vcr(118),Vcr(119),Vcr(120),Vcr(121),Vcr(122),Vcr(123),
-        Vcr(124),Vcr(125),Vcr(126),Vcr(127),Vcr(128),Vcr(129),Vcr(130),Vcr(131),
-        Vcr(132),Vcr(133),Vcr(134),Vcr(135),Vcr(136),Vcr(137),Vcr(138),Vcr(139),
-        Vcr(140),Vcr(141),Vcr(142),Vcr(143),Vcr(144),Vcr(145),Vcr(146),Vcr(147),
-        Vcr(148),Vcr(149),Vcr(150),Vcr(151),Vcr(152),Vcr(153),Vcr(154),Vcr(155),
-        Vcr(156),Vcr(157),Vcr(158),Vcr(159),Vcr(160),Vcr(161),Vcr(162),Vcr(163),
-        Vcr(164),Vcr(165),Vcr(166),Vcr(167),Vcr(168),Vcr(169),Vcr(170),Vcr(171),
-        Vcr(172),Vcr(173),Vcr(174),Vcr(175),Vcr(176),Vcr(177),Vcr(178),Vcr(179),
-        Vcr(180),Vcr(181),Vcr(182),Vcr(183),Vcr(184),Vcr(185),Vcr(186),Vcr(187),
-        Vcr(188),Vcr(189),Vcr(190),Vcr(191),Vcr(192),Vcr(193),Vcr(194),Vcr(195),
-        Vcr(196),Vcr(197),Vcr(198),Vcr(199),Vcr(200),Vcr(201),Vcr(202),Vcr(203),
-        Vcr(204),Vcr(205),Vcr(206),Vcr(207),Vcr(208),Vcr(209),Vcr(210),Vcr(211),
-        Vcr(212),Vcr(213),Vcr(214),Vcr(215),Vcr(216),Vcr(217),Vcr(218),Vcr(219),
-        Vcr(220),Vcr(221),Vcr(222),Vcr(223),Vcr(224),Vcr(225),Vcr(226),Vcr(227),
-        Vcr(228),Vcr(229),Vcr(230),Vcr(231),Vcr(232),Vcr(233),Vcr(234),Vcr(235),
-        Vcr(236),Vcr(237),Vcr(238),Vcr(239),Vcr(240),Vcr(241),Vcr(242),Vcr(243),
-        Vcr(244),Vcr(245),Vcr(246),Vcr(247),Vcr(248),Vcr(249),Vcr(250),Vcr(251),
-        Vcr(252),Vcr(253),Vcr(254),Vcr(255)};
-
-
-         static const int mapVcg[256] = {
-        Vcg(0),Vcg(1),Vcg(2),Vcg(3),Vcg(4),Vcg(5),Vcg(6),Vcg(7),Vcg(8),Vcg(9),
-        Vcg(10),Vcg(11),Vcg(12),Vcg(13),Vcg(14),Vcg(15),Vcg(16),Vcg(17),Vcg(18),
-        Vcg(19),Vcg(20),Vcg(21),Vcg(22),Vcg(23),Vcg(24),Vcg(25),Vcg(26),Vcg(27),
-        Vcg(28),Vcg(29),Vcg(30),Vcg(31),Vcg(32),Vcg(33),Vcg(34),Vcg(35),Vcg(36),
-        Vcg(37),Vcg(38),Vcg(39),Vcg(40),Vcg(41),Vcg(42),Vcg(43),Vcg(44),Vcg(45),
-        Vcg(46),Vcg(47),Vcg(48),Vcg(49),Vcg(50),Vcg(51),Vcg(52),Vcg(53),Vcg(54),
-        Vcg(55),Vcg(56),Vcg(57),Vcg(58),Vcg(59),Vcg(60),Vcg(61),Vcg(62),Vcg(63),
-        Vcg(64),Vcg(65),Vcg(66),Vcg(67),Vcg(68),Vcg(69),Vcg(70),Vcg(71),Vcg(72),
-        Vcg(73),Vcg(74),Vcg(75),Vcg(76),Vcg(77),Vcg(78),Vcg(79),Vcg(80),Vcg(81),
-        Vcg(82),Vcg(83),Vcg(84),Vcg(85),Vcg(86),Vcg(87),Vcg(88),Vcg(89),Vcg(90),
-        Vcg(91),Vcg(92),Vcg(93),Vcg(94),Vcg(95),Vcg(96),Vcg(97),Vcg(98),Vcg(99),
-        Vcg(100),Vcg(101),Vcg(102),Vcg(103),Vcg(104),Vcg(105),Vcg(106),Vcg(107),
-        Vcg(108),Vcg(109),Vcg(110),Vcg(111),Vcg(112),Vcg(113),Vcg(114),Vcg(115),
-        Vcg(116),Vcg(117),Vcg(118),Vcg(119),Vcg(120),Vcg(121),Vcg(122),Vcg(123),
-        Vcg(124),Vcg(125),Vcg(126),Vcg(127),Vcg(128),Vcg(129),Vcg(130),Vcg(131),
-        Vcg(132),Vcg(133),Vcg(134),Vcg(135),Vcg(136),Vcg(137),Vcg(138),Vcg(139),
-        Vcg(140),Vcg(141),Vcg(142),Vcg(143),Vcg(144),Vcg(145),Vcg(146),Vcg(147),
-        Vcg(148),Vcg(149),Vcg(150),Vcg(151),Vcg(152),Vcg(153),Vcg(154),Vcg(155),
-        Vcg(156),Vcg(157),Vcg(158),Vcg(159),Vcg(160),Vcg(161),Vcg(162),Vcg(163),
-        Vcg(164),Vcg(165),Vcg(166),Vcg(167),Vcg(168),Vcg(169),Vcg(170),Vcg(171),
-        Vcg(172),Vcg(173),Vcg(174),Vcg(175),Vcg(176),Vcg(177),Vcg(178),Vcg(179),
-        Vcg(180),Vcg(181),Vcg(182),Vcg(183),Vcg(184),Vcg(185),Vcg(186),Vcg(187),
-        Vcg(188),Vcg(189),Vcg(190),Vcg(191),Vcg(192),Vcg(193),Vcg(194),Vcg(195),
-        Vcg(196),Vcg(197),Vcg(198),Vcg(199),Vcg(200),Vcg(201),Vcg(202),Vcg(203),
-        Vcg(204),Vcg(205),Vcg(206),Vcg(207),Vcg(208),Vcg(209),Vcg(210),Vcg(211),
-        Vcg(212),Vcg(213),Vcg(214),Vcg(215),Vcg(216),Vcg(217),Vcg(218),Vcg(219),
-        Vcg(220),Vcg(221),Vcg(222),Vcg(223),Vcg(224),Vcg(225),Vcg(226),Vcg(227),
-        Vcg(228),Vcg(229),Vcg(230),Vcg(231),Vcg(232),Vcg(233),Vcg(234),Vcg(235),
-        Vcg(236),Vcg(237),Vcg(238),Vcg(239),Vcg(240),Vcg(241),Vcg(242),Vcg(243),
-        Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251),
-        Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif
-
diff --git a/files/source/convert.cc b/files/source/convert.cc
index 0882c92b..e332bc50 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -12,13 +12,9 @@
 
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
-#include "libyuv/video_common.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -26,7 +22,43 @@ namespace libyuv {
 extern "C" {
 #endif
 
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      src_uv_width == 0 || src_uv_height == 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
 // Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// is does row coalescing.
 LIBYUV_API
 int I420Copy(const uint8* src_y, int src_stride_y,
              const uint8* src_u, int src_stride_u,
@@ -35,6 +67,8 @@ int I420Copy(const uint8* src_y, int src_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_u || !src_v ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -43,7 +77,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
     src_u = src_u + (halfheight - 1) * src_stride_u;
     src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -52,76 +86,17 @@ int I420Copy(const uint8* src_y, int src_stride_y,
     src_stride_v = -src_stride_v;
   }
 
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
+  // Copy UV planes.
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
   return 0;
 }
 
-// Move to row_win etc.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HALFROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // src_uv_stride
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    sub        edi, eax
-
-    align      16
-  convertloop:
-    movdqa     xmm0, [eax]
-    pavgb      xmm0, [eax + edx]
-    sub        ecx, 16
-    movdqa     [eax + edi], xmm0
-    lea        eax,  [eax + 16]
-    jg         convertloop
-    pop        edi
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_HALFROW_SSE2
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  asm volatile (
-  "sub        %0,%1                            \n"
-  ".p2align  4                                 \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "pavgb      (%0,%3),%%xmm0                   \n"
-  "sub        $0x10,%2                         \n"
-  "movdqa     %%xmm0,(%0,%1)                   \n"
-  "lea        0x10(%0),%0                      \n"
-  "jg         1b                               \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_uv),  // %1
-    "+r"(pix)      // %2
-  : "r"(static_cast<intptr_t>(src_uv_stride))  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0"
-#endif
-);
-}
-#endif
-
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-                      uint8* dst_uv, int pix) {
-  for (int x = 0; x < pix; ++x) {
-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
-  }
-}
-
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
 int I422ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -130,78 +105,19 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  int halfwidth = (width + 1) >> 1;
-  void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) = HalfRow_C;
-#if defined(HAS_HALFROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(halfwidth, 16) &&
-      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-    HalfRow = HalfRow_SSE2;
-  }
-#endif
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  // SubSample U plane.
-  int y;
-  for (y = 0; y < height - 1; y += 2) {
-    HalfRow(src_u, src_stride_u, dst_u, halfwidth);
-    src_u += src_stride_u * 2;
-    dst_u += dst_stride_u;
-  }
-  if (height & 1) {
-    HalfRow(src_u, 0, dst_u, halfwidth);
-  }
-
-  // SubSample V plane.
-  for (y = 0; y < height - 1; y += 2) {
-    HalfRow(src_v, src_stride_v, dst_v, halfwidth);
-    src_v += src_stride_v * 2;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    HalfRow(src_v, 0, dst_v, halfwidth);
-  }
-  return 0;
+  const int src_uv_width = SUBSAMPLE(width, 1, 1);
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    src_uv_width, height);
 }
 
-// Blends 32x2 pixels to 16x1
-// source in scale.cc
-#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-#elif !defined(YUV_DISABLE_ASM) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-#endif
-void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
 int I444ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -210,74 +126,16 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  int halfwidth = (width + 1) >> 1;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C;
-#if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(halfwidth, 16)) {
-    ScaleRowDown2 = ScaleRowDown2Int_NEON;
-  }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(halfwidth, 16) &&
-      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-    ScaleRowDown2 = ScaleRowDown2Int_SSE2;
-  }
-#endif
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  // SubSample U plane.
-  int y;
-  for (y = 0; y < height - 1; y += 2) {
-    ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
-    src_u += src_stride_u * 2;
-    dst_u += dst_stride_u;
-  }
-  if (height & 1) {
-    ScaleRowDown2(src_u, 0, dst_u, halfwidth);
-  }
-
-  // SubSample V plane.
-  for (y = 0; y < height - 1; y += 2) {
-    ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
-    src_v += src_stride_v * 2;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ScaleRowDown2(src_v, 0, dst_v, halfwidth);
-  }
-  return 0;
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    width, height);
 }
 
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
-
 // 411 chroma is 1/4 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
@@ -288,45 +146,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  int quarterwidth = (width + 3) >> 2;
-
-  // Resample U plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
-
-  // Resample V plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
-  return 0;
+  const int src_uv_width = SUBSAMPLE(width, 3, 2);
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    src_uv_width, height);
 }
 
 // I400 is greyscale typically used in MJPG
@@ -336,6 +164,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -343,11 +173,10 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
   SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
@@ -355,33 +184,42 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
 }
 
 static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride_frame,
+                       uint8* dst, int dst_stride,
                        int width, int height) {
+  int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
-#elif defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-#if defined(HAS_COPYROW_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
-        IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
-        IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-      CopyRow = CopyRow_SSE2;
-    }
 #endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
   }
 #endif
 
   // Copy plane
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     CopyRow(src, dst, width);
-    CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
+    CopyRow(src + src_stride_0, dst + dst_stride, width);
     src += src_stride_0 + src_stride_1;
-    dst += dst_stride_frame * 2;
+    dst += dst_stride * 2;
   }
   if (height & 1) {
     CopyRow(src, dst, width);
@@ -404,6 +242,11 @@ static int X420ToI420(const uint8* src_y,
                       uint8* dst_u, int dst_stride_u,
                       uint8* dst_v, int dst_stride_v,
                       int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
   if (!src_y || !src_uv ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -412,7 +255,7 @@ static int X420ToI420(const uint8* src_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_u = dst_u + (halfheight - 1) * dst_stride_u;
     dst_v = dst_v + (halfheight - 1) * dst_stride_v;
@@ -420,33 +263,70 @@ static int X420ToI420(const uint8* src_y,
     dst_stride_u = -dst_stride_u;
     dst_stride_v = -dst_stride_v;
   }
-
-  int halfwidth = (width + 1) >> 1;
-  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUV_C;
-#if defined(HAS_SPLITUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
-    SplitUV = SplitUV_NEON;
+  // Coalesce rows.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == halfwidth * 2 &&
+      dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
   }
-#elif defined(HAS_SPLITUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(halfwidth, 16) &&
-      IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
-      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-    SplitUV = SplitUV_SSE2;
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+    SplitUVRow = SplitUVRow_Any_DSPR2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_DSPR2;
+    }
   }
 #endif
 
   if (dst_y) {
-    CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
-               width, height);
+    if (src_stride_y0 == src_stride_y1) {
+      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+    } else {
+      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);
+    }
   }
 
-  int halfheight = (height + 1) >> 1;
-  for (int y = 0; y < halfheight; ++y) {
+  for (y = 0; y < halfheight; ++y) {
     // Copy a row of UV.
-    SplitUV(src_uv, dst_u, dst_v, halfwidth);
+    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
     src_uv += src_stride_uv;
@@ -470,173 +350,36 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
                     width, height);
 }
 
-// Convert M420 to I420.
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_vu, src_stride_vu,
                     dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
                     dst_v, dst_stride_v,
+                    dst_u, dst_stride_u,
                     width, height);
 }
 
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
+// Convert M420 to I420.
 LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_yuy2, int src_stride_yuy2,
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_yuy2 ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    int halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    CopyRow = CopyRow_SSE2;
-  }
-#endif
-
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-      int pix) = YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
-      YUY2ToYRow_C;
-#if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
-    }
-  }
-#elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
-    }
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToYRow = YUY2ToYRow_NEON;
-      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
-    }
-  }
-#endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    CopyRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
-
-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
-    YUY2ToYRow(src_yuy2, dst_y, width);
-    src_yuy2 += src_stride_yuy2;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    CopyRow(src_y, dst_y, width);
-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
-  }
-  return 0;
-}
-
-// Test if over reading on source is safe.
-// TODO(fbarchard): Find more efficient solution to safely do odd sizes.
-// Macros to control read policy, from slowest to fastest:
-// READSAFE_NEVER - disables read ahead on systems with strict memory reads
-// READSAFE_ODDHEIGHT - last row of odd height done with C.
-//   This policy assumes that the caller handles the last row of an odd height
-//   image using C.
-// READSAFE_PAGE - enable read ahead within same page.
-//   A page is 4096 bytes. When reading ahead, if the last pixel is near the
-//   end the page, and a read spans the page into the next page, a memory
-//   exception can occur if that page has not been allocated, or is a guard
-//   page. This setting ensures the overread is within the same page.
-// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
-//   or where buffers are padded by 64 bytes.
-
-#if defined(HAS_RGB24TOARGBROW_SSSE3) || \
-    defined(HAS_RGB24TOARGBROW_SSSE3) || \
-    defined(HAS_RAWTOARGBROW_SSSE3) || \
-    defined(HAS_RGB565TOARGBROW_SSE2) || \
-    defined(HAS_ARGB1555TOARGBROW_SSE2) || \
-    defined(HAS_ARGB4444TOARGBROW_SSE2)
-
-#define READSAFE_ODDHEIGHT
-
-static bool TestReadSafe(const uint8* src_yuy2, int src_stride_yuy2,
-                        int width, int height, int bpp, int overread) {
-  if (width > kMaxStride) {
-    return false;
-  }
-#if defined(READSAFE_ALWAYS)
-  return true;
-#elif defined(READSAFE_NEVER)
-  return false;
-#elif defined(READSAFE_ODDHEIGHT)
-  if (!(width & 15) ||
-      (src_stride_yuy2 >= 0 && (height & 1) && width * bpp >= overread)) {
-    return true;
-  }
-  return false;
-#elif defined(READSAFE_PAGE)
-  if (src_stride_yuy2 >= 0) {
-    src_yuy2 += (height - 1) * src_stride_yuy2;
-  }
-  uintptr_t last_adr = (uintptr_t)(src_yuy2) + width * bpp - 1;
-  uintptr_t last_read_adr = last_adr + overread - 1;
-  if (((last_adr ^ last_read_adr) & ~4095) == 0) {
-    return true;
-  }
-  return false;
-#endif
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
 }
-#endif
 
 // Convert YUY2 to I420.
 LIBYUV_API
@@ -645,43 +388,41 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+      uint8* dst_y, int width) = YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-                     uint8* dst_y, int pix);
-  YUY2ToYRow = YUY2ToYRow_C;
-  YUY2ToUVRow = YUY2ToUVRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUVRow = YUY2ToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUVRow = YUY2ToUVRow_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
     }
   }
-#elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
-      }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
     }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUVRow = YUY2ToUVRow_NEON;
@@ -689,7 +430,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
     YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
@@ -712,43 +453,41 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+      uint8* dst_y, int width) = UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix);
-  UYVYToYRow = UYVYToYRow_C;
-  UYVYToUVRow = UYVYToUVRow_C;
 #if defined(HAS_UYVYTOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUVRow = UYVYToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
     }
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
     }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    UYVYToUVRow = UYVYToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUVRow = UYVYToUVRow_NEON;
@@ -756,7 +495,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
     UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
@@ -772,199 +511,67 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
 
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define READWORD(p) (*reinterpret_cast<const uint32*>(p))
-#else
-static inline uint32 READWORD(const uint8* p) {
-  return static_cast<uint32>(p[0]) |
-      (static_cast<uint32>(p[1]) << 8) |
-      (static_cast<uint32>(p[2]) << 16) |
-      (static_cast<uint32>(p[3]) << 24);
-}
-#endif
-
-// Must be multiple of 6 pixels. Will over convert to handle remainder.
-// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
-static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
-  for (int x = 0; x < width; x += 6) {
-    uint32 w = READWORD(src_v210 + 0);
-    dst_uyvy[0] = (w >> 2) & 0xff;
-    dst_uyvy[1] = (w >> 12) & 0xff;
-    dst_uyvy[2] = (w >> 22) & 0xff;
-
-    w = READWORD(src_v210 + 4);
-    dst_uyvy[3] = (w >> 2) & 0xff;
-    dst_uyvy[4] = (w >> 12) & 0xff;
-    dst_uyvy[5] = (w >> 22) & 0xff;
-
-    w = READWORD(src_v210 + 8);
-    dst_uyvy[6] = (w >> 2) & 0xff;
-    dst_uyvy[7] = (w >> 12) & 0xff;
-    dst_uyvy[8] = (w >> 22) & 0xff;
-
-    w = READWORD(src_v210 + 12);
-    dst_uyvy[9] = (w >> 2) & 0xff;
-    dst_uyvy[10] = (w >> 12) & 0xff;
-    dst_uyvy[11] = (w >> 22) & 0xff;
-
-    src_v210 += 16;
-    dst_uyvy += 12;
-  }
-}
-
-// Convert V210 to I420.
-// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
-// With is multiple of 48.
+// Convert ARGB to I420.
 LIBYUV_API
-int V210ToI420(const uint8* src_v210, int src_stride_v210,
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (width * 2 * 2 > kMaxStride) {  // 2 rows of UYVY are required.
-    return -1;
-  } else if (!src_v210 || !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_v210 = src_v210 + (height - 1) * src_stride_v210;
-    src_stride_v210 = -src_stride_v210;
-  }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*V210ToUYVYRow)(const uint8* src_v210, uint8* dst_uyvy, int pix);
-  V210ToUYVYRow = V210ToUYVYRow_C;
-
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix);
-  UYVYToYRow = UYVYToYRow_C;
-  UYVYToUVRow = UYVYToUVRow_C;
-#if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
-    UYVYToUVRow = UYVYToUVRow_SSE2;
-    UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-    if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-      UYVYToYRow = UYVYToYRow_SSE2;
-    }
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
-    }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToYRow = UYVYToYRow_NEON;
-      UYVYToUVRow = UYVYToUVRow_NEON;
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-#if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      UYVYToUVRow = UYVYToUVRow_SSE2;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        UYVYToYRow = UYVYToYRow_SSE2;
-      }
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
-    }
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToYRow = UYVYToYRow_NEON;
-      UYVYToUVRow = UYVYToUVRow_NEON;
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    V210ToUYVYRow(src_v210, row, width);
-    V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
-    UYVYToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    UYVYToYRow(row, dst_y, width);
-    UYVYToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_v210 += src_stride_v210 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    V210ToUYVYRow(src_v210, row, width);
-    UYVYToUVRow(row, 0, dst_u, dst_v, width);
-    UYVYToYRow(row, dst_y, width);
-  }
-  return 0;
-}
-
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    }
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
@@ -980,12 +587,18 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert BGRA to I420.
 LIBYUV_API
 int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+      BGRAToYRow_C;
   if (!src_bgra ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -997,32 +610,34 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
     src_bgra = src_bgra + (height - 1) * src_stride_bgra;
     src_stride_bgra = -src_stride_bgra;
   }
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix);
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  BGRAToYRow = BGRAToYRow_C;
-  BGRAToUVRow = BGRAToUVRow_C;
-#if defined(HAS_BGRATOYROW_SSSE3)
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
-      BGRAToYRow = BGRAToYRow_Any_SSSE3;
-    }
+    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+    BGRAToYRow = BGRAToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
-      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
-        BGRAToUVRow = BGRAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          BGRAToYRow = BGRAToYRow_SSSE3;
-        }
-      }
+      BGRAToUVRow = BGRAToUVRow_SSSE3;
+      BGRAToYRow = BGRAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToYRow = BGRAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_NEON;
     }
   }
 #endif
+#if defined(HAS_BGRATOUVROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      BGRAToUVRow = BGRAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        BGRAToUVRow = BGRAToUVRow_NEON;
+      }
+    }
+#endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
     BGRAToYRow(src_bgra, dst_y, width);
     BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
@@ -1038,12 +653,18 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
   return 0;
 }
 
+// Convert ABGR to I420.
 LIBYUV_API
 int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+      ABGRToYRow_C;
   if (!src_abgr ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1055,32 +676,34 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     src_abgr = src_abgr + (height - 1) * src_stride_abgr;
     src_stride_abgr = -src_stride_abgr;
   }
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix);
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ABGRToYRow = ABGRToYRow_C;
-  ABGRToUVRow = ABGRToUVRow_C;
-#if defined(HAS_ABGRTOYROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
-      ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
-      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
-        ABGRToUVRow = ABGRToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ABGRToYRow = ABGRToYRow_SSSE3;
-        }
-      }
+      ABGRToUVRow = ABGRToUVRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
     ABGRToYRow(src_abgr, dst_y, width);
     ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
@@ -1096,12 +719,18 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
   return 0;
 }
 
+// Convert RGBA to I420.
 LIBYUV_API
 int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+      RGBAToYRow_C;
   if (!src_rgba ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1113,32 +742,34 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
     src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix);
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  RGBAToYRow = RGBAToYRow_C;
-  RGBAToUVRow = RGBAToUVRow_C;
-#if defined(HAS_RGBATOYROW_SSSE3)
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
-      RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+      RGBAToYRow = RGBAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
-      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
-        RGBAToUVRow = RGBAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          RGBAToYRow = RGBAToYRow_SSSE3;
-        }
-      }
+      RGBAToUVRow = RGBAToUVRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
     RGBAToYRow(src_rgba, dst_y, width);
     RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
@@ -1154,18 +785,30 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
   return 0;
 }
 
+// Convert RGB24 to I420.
 LIBYUV_API
 int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
                 uint8* dst_y, int dst_stride_y,
                 uint8* dst_u, int dst_stride_u,
                 uint8* dst_v, int dst_stride_v,
                 int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
-  } else if (!src_rgb24 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
@@ -1173,70 +816,113 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
     src_stride_rgb24 = -src_stride_rgb24;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
 
-  RGB24ToARGBRow = RGB24ToARGBRow_C;
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
   }
 #endif
-
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
   }
-  if (height & 1) {
-    RGB24ToARGBRow_C(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB24TOYROW_NEON)
+    free_aligned_buffer_64(row);
   }
+#endif
   return 0;
 }
 
+// Convert RAW to I420.
 LIBYUV_API
 int RAWToI420(const uint8* src_raw, int src_stride_raw,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  int y;
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_raw || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
-  } else if (!src_raw ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
@@ -1244,69 +930,112 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
 
-  RAWToARGBRow = RAWToARGBRow_C;
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) {
-    RAWToARGBRow = RAWToARGBRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
   }
 #endif
-
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_raw += src_stride_raw * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
   }
-  if (height & 1) {
-    RAWToARGBRow_C(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
   }
+#endif
   return 0;
 }
 
+// Convert RGB565 to I420.
 LIBYUV_API
 int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
                  uint8* dst_y, int dst_stride_y,
                  uint8* dst_u, int dst_stride_u,
                  uint8* dst_v, int dst_stride_v,
                  int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
-    return -1;
-  } else if (!src_rgb565 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
+  int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+      RGB565ToYRow_C;
+#else
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1315,70 +1044,121 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
     src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
     src_stride_rgb565 = -src_stride_rgb565;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
 
-  RGB565ToARGBRow = RGB565ToARGBRow_C;
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
   }
 #endif
-
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
     }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
   }
-  if (height & 1) {
-    RGB565ToARGBRow_C(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB565TOYROW_NEON)
+    free_aligned_buffer_64(row);
   }
+#endif
   return 0;
 }
 
+// Convert ARGB1555 to I420.
 LIBYUV_API
 int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
+      ARGB1555ToYRow_C;
+#else
+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
-  } else if (!src_argb1555 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
@@ -1386,71 +1166,123 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
     src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
     src_stride_argb1555 = -src_stride_argb1555;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
 
-  ARGB1555ToARGBRow = ARGB1555ToARGBRow_C;
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      TestReadSafe(src_argb1555, src_stride_argb1555, width, height, 2, 16)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
   }
 #endif
-
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
     }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555,
-                      row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
   }
-  if (height & 1) {
-    ARGB1555ToARGBRow_C(src_argb1555, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    free_aligned_buffer_64(row);
   }
+#endif
   return 0;
 }
 
+// Convert ARGB4444 to I420.
 LIBYUV_API
 int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
                    uint8* dst_y, int dst_stride_y,
                    uint8* dst_u, int dst_stride_u,
                    uint8* dst_v, int dst_stride_v,
                    int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
+      ARGB4444ToYRow_C;
+#else
+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
     return -1;
-  } else if (!src_argb4444 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
@@ -1458,621 +1290,97 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
     src_stride_argb4444 = -src_stride_argb4444;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
 
-  ARGB4444ToARGBRow = ARGB4444ToARGBRow_C;
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      TestReadSafe(src_argb4444, src_stride_argb4444, width, height, 2, 16)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
   }
 #endif
-
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
     }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444,
-                      row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ARGB4444ToARGBRow_C(src_argb4444, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-  }
-  return 0;
-}
-
-#ifdef HAVE_JPEG
-struct I420Buffers {
-  uint8* y;
-  int y_stride;
-  uint8* u;
-  int u_stride;
-  uint8* v;
-  int v_stride;
-  int w;
-  int h;
-};
-
-static void JpegCopyI420(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
-  I420Buffers* dest = static_cast<I420Buffers*>(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI422ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = static_cast<I420Buffers*>(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI444ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = static_cast<I420Buffers*>(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = static_cast<I420Buffers*>(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI400ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = static_cast<I420Buffers*>(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToI420(const uint8* sample,
-               size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
-    // ERROR: MJPEG frame size unknown
-    return -1;
-  }
-
-  // TODO(fbarchard): Port to C
-  MJpegDecoder mjpeg_decoder;
-  bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
-    // ERROR: MJPEG frame has unexpected dimensions
-    mjpeg_decoder.UnloadFrame();
-    return 1;  // runtime failure
-  }
-  if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
-    // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
-        mjpeg_decoder.GetNumComponents() == 3 &&
-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceGrayscale &&
-               mjpeg_decoder.GetNumComponents() == 1 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
-    } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
-      mjpeg_decoder.UnloadFrame();
-      return 1;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
-  return 0;
-}
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
 #endif
 
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-//   With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToI420(const uint8* sample,
-#ifdef HAVE_JPEG
-                  size_t sample_size,
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
 #else
-                  size_t /* sample_size */,
-#endif
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int dst_width, int dst_height,
-                  RotationMode rotation,
-                  uint32 format) {
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || dst_width <= 0  ||
-      src_height == 0 || dst_height == 0) {
-    return -1;
-  }
-  int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
-  if (src_height < 0) {
-    inv_dst_height = -inv_dst_height;
-  }
-  int r = 0;
-
-  // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination y is same as source sample,
-  // also enable temporary buffer.
-  bool need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
-  uint8* tmp_y = y;
-  uint8* tmp_u = u;
-  uint8* tmp_v = v;
-  int tmp_y_stride = y_stride;
-  int tmp_u_stride = u_stride;
-  int tmp_v_stride = v_stride;
-  uint8* buf = NULL;
-  int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
-  if (need_buf) {
-    int y_size = dst_width * abs_dst_height;
-    int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
-    buf = new uint8[y_size + uv_size * 2];
-    if (!buf) {
-      return 1;  // Out of memory runtime error.
-    }
-    y = buf;
-    u = y + y_size;
-    v = u + uv_size;
-    y_stride = dst_width;
-    u_stride = v_stride = ((dst_width + 1) / 2);
-  }
-
-  switch (format) {
-    // Single plane formats
-    case FOURCC_YUY2:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_UYVY:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_V210:
-      // stride is multiple of 48 pixels (128 bytes).
-      // pixels come in groups of 6 = 16 bytes
-      src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
-            crop_x / 6 * 16;
-      r = V210ToI420(src, (aligned_src_width + 47) / 48 * 128,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_24BG:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_width, inv_dst_height);
-      break;
-    case FOURCC_RAW:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_width, inv_dst_height);
-      break;
-    case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_BGRA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_ABGR:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBP:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBO:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_width, inv_dst_height);
-      break;
-    case FOURCC_R444:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_width, inv_dst_height);
-      break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_I400:
-      src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-
-    // Biplanar formats
-    case FOURCC_NV12:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           dst_width, inv_dst_height, rotation);
-      break;
-    case FOURCC_NV21:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           dst_width, inv_dst_height, rotation);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_Q420:
-      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-               src_width + crop_x * 2;
-      r = Q420ToI420(src, src_width * 3,
-                    src_uv, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_width, inv_dst_height);
-      break;
-    // Triplanar formats
-    case FOURCC_I420:
-    case FOURCC_YU12:
-    case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      int halfheight = (abs_src_height + 1) / 2;
-      if (format == FOURCC_YV12) {
-        src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height, rotation);
-      break;
-    }
-    case FOURCC_I422:
-    case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-    case FOURCC_I444:
-    case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      if (format == FOURCC_YV24) {
-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      } else {
-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_width, inv_dst_height);
-      break;
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
     }
-#ifdef HAVE_JPEG
-    case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     src_width, abs_src_height, dst_width, inv_dst_height);
-      break;
-#endif
-    default:
-      r = -1;  // unknown fourcc - return failure code.
-  }
-
-  if (need_buf) {
-    if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     dst_width, abs_dst_height, rotation);
+    if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
     }
-    delete buf;
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    free_aligned_buffer_64(row);
   }
-
-  return r;
+#endif
+  return 0;
 }
 
 #ifdef __cplusplus
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index 1c5aa9d9..fb9582d6 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -4,22 +4,20 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/convert_argb.h"
 
-#include <string.h>  // for memset()
-
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
+#include "libyuv/planar_functions.h"  // For CopyPlane and ARGBShuffle.
 #include "libyuv/rotate_argb.h"
-#include "libyuv/video_common.h"
 #include "libyuv/row.h"
+#include "libyuv/video_common.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -47,13 +45,180 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Convert I444 to ARGB.
+// Convert I422 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvH709Constants,
+                          width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I422 to ARGB with matrix
+static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -65,25 +230,51 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I444ToARGBRow_C;
-#if defined(HAS_I444TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I444ToARGBRow = I444ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
     }
   }
 #endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
 
-  for (int y = 0; y < height; ++y) {
-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -99,6 +290,103 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvH709Constants,
+                          width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I444ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -110,32 +398,42 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u == width &&
+      src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
+      I444ToARGBRow = I444ToARGBRow_AVX2;
     }
   }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I444ToARGBRow = I444ToARGBRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -144,6 +442,51 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
 // Convert I411 to ARGB.
 LIBYUV_API
 int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -151,6 +494,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I411ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I411ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -162,25 +512,42 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I411ToARGBRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 4 == width &&
+      src_stride_v * 4 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
 #if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I411ToARGBRow = I411ToARGBRow_SSSE3;
-      }
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I411ToARGBRow = I411ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -189,13 +556,25 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
-                         uint8* dst_argb, int dst_stride_argb,
-                         int width, int height) {
-  if (!src_y || !dst_argb ||
+// Convert I420 with Alpha to preattenuated ARGB.
+static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
+                                 const uint8* src_u, int src_stride_u,
+                                 const uint8* src_v, int src_stride_v,
+                                 const uint8* src_a, int src_stride_a,
+                                 uint8* dst_argb, int dst_stride_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width, int height, int attenuate) {
+  int y;
+  void (*I422AlphaToARGBRow)(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
   }
@@ -205,30 +584,124 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*YToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = YToARGBRow_C;
-#if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_SSE2;
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    YToARGBRow(src_y, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
     dst_argb += dst_stride_argb;
+    src_a += src_stride_a;
     src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
   }
   return 0;
 }
 
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_u, src_stride_u,
+                               src_v, src_stride_v,
+                               src_a, src_stride_a,
+                               dst_argb, dst_stride_argb,
+                               &kYuvI601Constants,
+                               width, height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_v, src_stride_v,  // Swap U and V
+                               src_u, src_stride_u,
+                               src_a, src_stride_a,
+                               dst_abgr, dst_stride_abgr,
+                               &kYvuI601Constants,  // Use Yvu matrix
+                               width, height, attenuate);
+}
+
 // Convert I400 to ARGB.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I400ToARGBRow)(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) = I400ToARGBRow_C;
   if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -236,94 +709,163 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
   }
-  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
-      I400ToARGBRow_C;
 #if defined(HAS_I400TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    I400ToARGBRow = I400ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I400ToARGBRow = I400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
-    src_y += src_stride_y;
     dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
   }
   return 0;
 }
 
-// Convert BGRA to ARGB.
+// Convert J400 to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+int J400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
-  if (!src_bgra || !dst_argb ||
+  int y;
+  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      J400ToARGBRow_C;
+  if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
-    src_stride_bgra = -src_stride_bgra;
-  }
-  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) =
-      BGRAToARGBRow_C;
-#if defined(HAS_BGRATOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_SSE2;
+    }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    BGRAToARGBRow(src_bgra, dst_argb, width);
-    src_bgra += src_stride_bgra;
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = J400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    J400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
     dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
 // Convert ABGR to ARGB.
 LIBYUV_API
 int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
-  if (!src_abgr || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
-    src_stride_abgr = -src_stride_abgr;
-  }
-  void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) =
-      ABGRToARGBRow_C;
-#if defined(HAS_ABGRTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ABGRToARGBRow = ABGRToARGBRow_SSSE3;
-  }
-#endif
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
 
-  for (int y = 0; y < height; ++y) {
-    ABGRToARGBRow(src_abgr, dst_argb, width);
-    src_abgr += src_stride_abgr;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
+// Convert ARGB to ABGR to (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
 }
 
 // Convert RGBA to ARGB.
@@ -331,30 +873,57 @@ LIBYUV_API
 int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
-  if (!src_rgba || !dst_argb ||
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
-    src_stride_rgba = -src_stride_rgba;
-  }
-  void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) =
-      RGBAToARGBRow_C;
-#if defined(HAS_RGBATOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGBAToARGBRow = RGBAToARGBRow_SSSE3;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb24 == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    RGBAToARGBRow(src_rgba, dst_argb, width);
-    src_rgba += src_stride_rgba;
+  for (y = 0; y < height; ++y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    src_rgb24 += src_stride_rgb24;
     dst_argb += dst_stride_argb;
   }
   return 0;
@@ -365,6 +934,9 @@ LIBYUV_API
 int RAWToARGB(const uint8* src_raw, int src_stride_raw,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RAWToARGBRow_C;
   if (!src_raw || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -375,52 +947,33 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
-  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) =
-      RAWToARGBRow_C;
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_argb = 0;
+  }
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RAWToARGBRow = RAWToARGBRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    RAWToARGBRow(src_raw, dst_argb, width);
-    src_raw += src_stride_raw;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert RGB24 to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-  void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) =
-      RGB24ToARGBRow_C;
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    RGB24ToARGBRow(src_rgb24, dst_argb, width);
-    src_rgb24 += src_stride_rgb24;
+  for (y = 0; y < height; ++y) {
+    RAWToARGBRow(src_raw, dst_argb, width);
+    src_raw += src_stride_raw;
     dst_argb += dst_stride_argb;
   }
   return 0;
@@ -431,6 +984,9 @@ LIBYUV_API
 int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height) {
+  int y;
+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
+      RGB565ToARGBRow_C;
   if (!src_rgb565 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -441,17 +997,39 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
     src_stride_rgb565 = -src_stride_rgb565;
   }
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
-      RGB565ToARGBRow_C;
+  // Coalesce rows.
+  if (src_stride_rgb565 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb565 = dst_stride_argb = 0;
+  }
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
     src_rgb565 += src_stride_rgb565;
     dst_argb += dst_stride_argb;
@@ -464,8 +1042,11 @@ LIBYUV_API
 int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height) {
+  int y;
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+      int width) = ARGB1555ToARGBRow_C;
   if (!src_argb1555 || !dst_argb ||
-       width <= 0 || height == 0) {
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -474,17 +1055,39 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
     src_stride_argb1555 = -src_stride_argb1555;
   }
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) = ARGB1555ToARGBRow_C;
+  // Coalesce rows.
+  if (src_stride_argb1555 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb1555 = dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
     src_argb1555 += src_stride_argb1555;
     dst_argb += dst_stride_argb;
@@ -497,6 +1100,9 @@ LIBYUV_API
 int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height) {
+  int y;
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+      int width) = ARGB4444ToARGBRow_C;
   if (!src_argb4444 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -507,17 +1113,39 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
     src_stride_argb4444 = -src_stride_argb4444;
   }
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) = ARGB4444ToARGBRow_C;
+  // Coalesce rows.
+  if (src_stride_argb4444 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb4444 = dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
     src_argb4444 += src_stride_argb4444;
     dst_argb += dst_stride_argb;
@@ -531,6 +1159,12 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV12ToARGBRow_C;
   if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -541,23 +1175,24 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
 #if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
     }
   }
 #endif
 #if defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -565,8 +1200,8 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -582,6 +1217,12 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV21ToARGBRow_C;
   if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -592,23 +1233,24 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV21ToARGBRow_C;
 #if defined(HAS_NV21TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
-      }
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
     }
   }
 #endif
 #if defined(HAS_NV21TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV21ToARGBRow = NV21ToARGBRow_NEON;
@@ -616,8 +1258,8 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -632,6 +1274,12 @@ LIBYUV_API
 int M420ToARGB(const uint8* src_m420, int src_stride_m420,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV12ToARGBRow_C;
   if (!src_m420 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -642,31 +1290,42 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
 #if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+  for (y = 0; y < height - 1; y += 2) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+                  &kYuvI601Constants, width);
     NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
-                  dst_argb + dst_stride_argb, width);
+                  dst_argb + dst_stride_argb, &kYuvI601Constants, width);
     dst_argb += dst_stride_argb * 2;
     src_m420 += src_stride_m420 * 3;
   }
   if (height & 1) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+                  &kYuvI601Constants, width);
   }
   return 0;
 }
@@ -676,6 +1335,12 @@ LIBYUV_API
 int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) =
+      YUY2ToARGBRow_C;
   if (!src_yuy2 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -686,72 +1351,39 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-      int pix) = YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-                     uint8* dst_y, int pix) = YUY2ToYRow_C;
-#if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_argb = 0;
+  }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        YUY2ToYRow = YUY2ToYRow_SSE2;
-      }
+      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
-    }
-    if (IS_ALIGNED(width, 8)) {
-      YUY2ToYRow = YUY2ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_NEON;
-      }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
     }
   }
 #endif
-
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* argb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
+#if defined(HAS_YUY2TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_NEON;
     }
   }
 #endif
-
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-
-  for (int y = 0; y < height; ++y) {
-    YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
-    YUY2ToYRow(src_yuy2, rowy, width);
-    I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
     src_yuy2 += src_stride_yuy2;
     dst_argb += dst_stride_argb;
   }
@@ -763,6 +1395,12 @@ LIBYUV_API
 int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*UYVYToARGBRow)(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) =
+      UYVYToARGBRow_C;
   if (!src_uyvy || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -773,527 +1411,45 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
-  void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-      int pix) = UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix) = UYVYToYRow_C;
-#if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+  // Coalesce rows.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_argb = 0;
+  }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUV422Row = UYVYToUV422Row_SSE2;
-        UYVYToYRow = UYVYToYRow_SSE2;
-      }
+      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
     }
   }
 #endif
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* argb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToARGBRow = UYVYToARGBRow_AVX2;
     }
   }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_NEON;
     }
   }
 #endif
-
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-
-  for (int y = 0; y < height; ++y) {
-    UYVYToUV422Row(src_uyvy, rowu, rowv, width);
-    UYVYToYRow(src_uyvy, rowy, width);
-    I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+  for (y = 0; y < height; ++y) {
+    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
     src_uyvy += src_stride_uyvy;
     dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
-#ifdef HAVE_JPEG
-struct ARGBBuffers {
-  uint8* argb;
-  int argb_stride;
-  int w;
-  int h;
-};
-
-static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
-  ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI444ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI400ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample,
-               size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
-    // ERROR: MJPEG frame size unknown
-    return -1;
-  }
-
-  // TODO(fbarchard): Port to C
-  MJpegDecoder mjpeg_decoder;
-  bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
-    // ERROR: MJPEG frame has unexpected dimensions
-    mjpeg_decoder.UnloadFrame();
-    return 1;  // runtime failure
-  }
-  if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
-    // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
-        mjpeg_decoder.GetNumComponents() == 3 &&
-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceGrayscale &&
-               mjpeg_decoder.GetNumComponents() == 1 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
-    } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
-      mjpeg_decoder.UnloadFrame();
-      return 1;
-    }
-  }
-  return 0;
-}
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-//   With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* dst_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int dst_width, int dst_height,
-                  RotationMode rotation,
-                  uint32 format) {
-  if (dst_argb == NULL || sample == NULL ||
-      src_width <= 0 || dst_width <= 0 ||
-      src_height == 0 || dst_height == 0) {
-    return -1;
-  }
-  int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
-  if (src_height < 0) {
-    inv_dst_height = -inv_dst_height;
-  }
-  int r = 0;
-
-  // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination dst_argb is same as source sample,
-  // also enable temporary buffer.
-  bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
-  uint8* tmp_argb = dst_argb;
-  int tmp_argb_stride = argb_stride;
-  uint8* buf = NULL;
-  int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
-  if (need_buf) {
-    int argb_size = dst_width * abs_dst_height * 4;
-    buf = new uint8[argb_size];
-    if (!buf) {
-      return 1;  // Out of memory runtime error.
-    }
-    dst_argb = buf;
-    argb_stride = dst_width;
-  }
-
-  switch (format) {
-    // Single plane formats
-    case FOURCC_YUY2:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_UYVY:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-//    case FOURCC_V210:
-      // stride is multiple of 48 pixels (128 bytes).
-      // pixels come in groups of 6 = 16 bytes
-//      src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
-//            crop_x / 6 * 16;
-//      r = V210ToARGB(src, (aligned_src_width + 47) / 48 * 128,
-//                     dst_argb, argb_stride,
-//                     dst_width, inv_dst_height);
-//      break;
-    case FOURCC_24BG:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      dst_argb, argb_stride,
-                      dst_width, inv_dst_height);
-      break;
-    case FOURCC_RAW:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    dst_argb, argb_stride,
-                    dst_width, inv_dst_height);
-      break;
-    case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToARGB(src, src_width * 4,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_BGRA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_ABGR:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBP:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       dst_argb, argb_stride,
-                       dst_width, inv_dst_height);
-      break;
-    case FOURCC_RGBO:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         dst_argb, argb_stride,
-                         dst_width, inv_dst_height);
-      break;
-    case FOURCC_R444:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         dst_argb, argb_stride,
-                         dst_width, inv_dst_height);
-      break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToARGB(src, src_width,
-                          dst_argb, argb_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToARGB(src, src_width,
-                          dst_argb, argb_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToARGB(src, src_width,
-                          dst_argb, argb_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToARGB(src, src_width,
-                          dst_argb, argb_stride,
-                          dst_width, inv_dst_height);
-      break;
-
-    case FOURCC_I400:
-      src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-
-    // Biplanar formats
-    case FOURCC_NV12:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_NV21:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-//    case FOURCC_Q420:
-//      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-//      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-//               src_width + crop_x * 2;
-//      r = Q420ToARGB(src, src_width * 3,
-//                    src_uv, src_width * 3,
-//                    dst_argb, argb_stride,
-//                    dst_width, inv_dst_height);
-//      break;
-    // Triplanar formats
-    case FOURCC_I420:
-    case FOURCC_YU12:
-    case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      int halfheight = (abs_src_height + 1) / 2;
-      if (format == FOURCC_YV12) {
-        src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-    case FOURCC_I422:
-    case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-    case FOURCC_I444:
-    case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      if (format == FOURCC_YV24) {
-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      } else {
-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     dst_argb, argb_stride,
-                     dst_width, inv_dst_height);
-      break;
-    }
-#ifdef HAVE_JPEG
-    case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     dst_argb, argb_stride,
-                     src_width, abs_src_height, dst_width, inv_dst_height);
-      break;
-#endif
-    default:
-      r = -1;  // unknown fourcc - return failure code.
-  }
-
-  if (need_buf) {
-    if (!r) {
-      r = ARGBRotate(dst_argb, argb_stride,
-                     tmp_argb, tmp_argb_stride,
-                     dst_width, abs_dst_height, rotation);
-    }
-    delete buf;
-  }
-
-  return r;
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index 4ea974ac..46abdebc 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -13,9 +13,9 @@
 #include "libyuv/basic_types.h"
 #include "libyuv/convert.h"  // For I420Copy
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"
 
@@ -24,6 +24,42 @@ namespace libyuv {
 extern "C" {
 #endif
 
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int dst_uv_width, int dst_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      dst_uv_width <= 0 || dst_uv_height <= 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
 LIBYUV_API
 int I420ToI422(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -32,78 +68,20 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  int halfwidth = (width + 1) >> 1;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
-    CopyRow = CopyRow_NEON;
-  }
-#elif defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(halfwidth, 4)) {
-    CopyRow = CopyRow_X86;
-#if defined(HAS_COPYROW_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
-        IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-        IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-        IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-        IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-      CopyRow = CopyRow_SSE2;
-    }
-#endif
-  }
-#endif
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  // UpSample U plane.
-  int y;
-  for (y = 0; y < height - 1; y += 2) {
-    CopyRow(src_u, dst_u, halfwidth);
-    CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
-    src_u += src_stride_u;
-    dst_u += dst_stride_u * 2;
-  }
-  if (height & 1) {
-    CopyRow(src_u, dst_u, halfwidth);
-  }
-
-  // UpSample V plane.
-  for (y = 0; y < height - 1; y += 2) {
-    CopyRow(src_v, dst_v, halfwidth);
-    CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
-    src_v += src_stride_v;
-    dst_v += dst_stride_v * 2;
-  }
-  if (height & 1) {
-    CopyRow(src_v, dst_v, halfwidth);
-  }
-  return 0;
+  const int dst_uv_width = (Abs(width) + 1) >> 1;
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
 }
 
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
-
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
 LIBYUV_API
 int I420ToI444(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -112,44 +90,16 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u|| !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-
-  // Upsample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
-
-  // Upsample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
-  return 0;
+  const int dst_uv_width = Abs(width);
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
 }
 
 // 420 chroma is 1/2 width, 1/2 height
@@ -162,45 +112,16 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-
-  // Copy Y plane
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  int quarterwidth = (width + 3) >> 2;
-
-  // Resample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
-
-  // Resample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
-  return 0;
+  const int dst_uv_width = (Abs(width) + 3) >> 2;
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
 }
 
 // Copy to I400. Source can be I420,422,444,400,NV12,NV21
@@ -222,282 +143,58 @@ int I400Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// YUY2 - Macro-pixel = 2 image pixels
-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-
-// UYVY - Macro-pixel = 2 image pixels
-// U0Y0V0Y1
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_I42XTOYUY2ROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void I42xToYUY2Row_SSE2(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_frame, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
-    sub        edx, esi
-
-    align      16
-  convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
-    lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqa     xmm0, [eax] // Y
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
-    punpckhbw  xmm1, xmm2
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm1
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#define HAS_I42XTOUYVYROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void I42xToUYVYRow_SSE2(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_frame, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
-    sub        edx, esi
-
-    align      16
-  convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
-    lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqa     xmm0, [eax] // Y
-    movdqa     xmm1, xmm2
-    lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
-    punpckhbw  xmm2, xmm0
-    movdqa     [edi], xmm1
-    movdqa     [edi + 16], xmm2
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_I42XTOYUY2ROW_SSE2
-static void I42xToYUY2Row_SSE2(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    ".p2align  4                                 \n"
-  "1:                                            \n"
-    "movq      (%1),%%xmm2                       \n"
-    "movq      (%1,%2,1),%%xmm3                  \n"
-    "lea       0x8(%1),%1                        \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqa    (%0),%%xmm0                       \n"
-    "lea       0x10(%0),%0                       \n"
-    "movdqa    %%xmm0,%%xmm1                     \n"
-    "punpcklbw %%xmm2,%%xmm0                     \n"
-    "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqa    %%xmm0,(%3)                       \n"
-    "movdqa    %%xmm1,0x10(%3)                   \n"
-    "lea       0x20(%3),%3                       \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-  );
-}
-
-#define HAS_I42XTOUYVYROW_SSE2
-static void I42xToUYVYRow_SSE2(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    ".p2align  4                                 \n"
-  "1:                                            \n"
-    "movq      (%1),%%xmm2                       \n"
-    "movq      (%1,%2,1),%%xmm3                  \n"
-    "lea       0x8(%1),%1                        \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqa    (%0),%%xmm0                       \n"
-    "movdqa    %%xmm2,%%xmm1                     \n"
-    "lea       0x10(%0),%0                       \n"
-    "punpcklbw %%xmm0,%%xmm1                     \n"
-    "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqa    %%xmm1,(%3)                       \n"
-    "movdqa    %%xmm2,0x10(%3)                   \n"
-    "lea       0x20(%3),%3                       \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-  );
-}
-#endif
-
-static void I42xToYUY2Row_C(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_frame, int width) {
-    for (int x = 0; x < width - 1; x += 2) {
-      dst_frame[0] = src_y[0];
-      dst_frame[1] = src_u[0];
-      dst_frame[2] = src_y[1];
-      dst_frame[3] = src_v[0];
-      dst_frame += 4;
-      src_y += 2;
-      src_u += 1;
-      src_v += 1;
-    }
-    if (width & 1) {
-      dst_frame[0] = src_y[0];
-      dst_frame[1] = src_u[0];
-      dst_frame[2] = src_y[0];  // duplicate last y
-      dst_frame[3] = src_v[0];
-    }
-}
-
-static void I42xToUYVYRow_C(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_frame, int width) {
-    for (int x = 0; x < width - 1; x += 2) {
-      dst_frame[0] = src_u[0];
-      dst_frame[1] = src_y[0];
-      dst_frame[2] = src_v[0];
-      dst_frame[3] = src_y[1];
-      dst_frame += 4;
-      src_y += 2;
-      src_u += 1;
-      src_v += 1;
-    }
-    if (width & 1) {
-      dst_frame[0] = src_u[0];
-      dst_frame[1] = src_y[0];
-      dst_frame[2] = src_v[0];
-      dst_frame[3] = src_y[0];  // duplicate last y
-    }
-}
-
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
-#else
-static inline void WRITEWORD(uint8* p, uint32 v) {
-  p[0] = (uint8)(v & 255);
-  p[1] = (uint8)((v >> 8) & 255);
-  p[2] = (uint8)((v >> 16) & 255);
-  p[3] = (uint8)((v >> 24) & 255);
-}
-#endif
-
-#define EIGHTTOTEN(x) (x << 2 | x >> 6)
-static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
-  for (int x = 0; x < width; x += 6) {
-    WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) |
-                            (EIGHTTOTEN(src_uyvy[1]) << 10) |
-                            (EIGHTTOTEN(src_uyvy[2]) << 20));
-    WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) |
-                            (EIGHTTOTEN(src_uyvy[4]) << 10) |
-                            (EIGHTTOTEN(src_uyvy[5]) << 20));
-    WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) |
-                            (EIGHTTOTEN(src_uyvy[7]) << 10) |
-                            (EIGHTTOTEN(src_uyvy[8]) << 20));
-    WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) |
-                             (EIGHTTOTEN(src_uyvy[10]) << 10) |
-                             (EIGHTTOTEN(src_uyvy[11]) << 20));
-    src_uyvy += 12;
-    dst_v210 += 16;
-  }
-}
-
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_yuy2, int dst_stride_yuy2,
                int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
-  }
-  void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
-      I42xToYUY2Row_C;
-#if defined(HAS_I42XTOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-    I42xToYUY2Row = I42xToYUY2Row_SSE2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I42xToYUY2Row(src_y, src_u, src_y, dst_frame, width);
+  for (y = 0; y < height; ++y) {
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
     src_y += src_stride_y;
     src_u += src_stride_u;
     src_v += src_stride_v;
-    dst_frame += dst_stride_frame;
+    dst_yuy2 += dst_stride_yuy2;
   }
   return 0;
 }
@@ -506,80 +203,106 @@ LIBYUV_API
 int I420ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_yuy2, int dst_stride_yuy2,
                int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
-  }
-  void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
-      I42xToYUY2Row_C;
-#if defined(HAS_I42XTOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-    I42xToYUY2Row = I42xToYUY2Row_SSE2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
-    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
-    I42xToYUY2Row(src_y + src_stride_y, src_u, src_v,
-                  dst_frame + dst_stride_frame, width);
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+                  dst_yuy2 + dst_stride_yuy2, width);
     src_y += src_stride_y * 2;
     src_u += src_stride_u;
     src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
+    dst_yuy2 += dst_stride_yuy2 * 2;
   }
   if (height & 1) {
-    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
   }
   return 0;
 }
 
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_uyvy, int dst_stride_uyvy,
                int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
-  }
-  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
-      I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-    I42xToUYVYRow = I42xToUYVYRow_SSE2;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I42xToUYVYRow(src_y, src_u, src_y, dst_frame, width);
+  for (y = 0; y < height; ++y) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
     src_y += src_stride_y;
     src_u += src_stride_u;
     src_v += src_stride_v;
-    dst_frame += dst_stride_frame;
+    dst_uyvy += dst_stride_uyvy;
   }
   return 0;
 }
@@ -588,249 +311,206 @@ LIBYUV_API
 int I420ToUYVY(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_uyvy, int dst_stride_uyvy,
                int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
-  }
-  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
-      I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-    I42xToUYVYRow = I42xToUYVYRow_SSE2;
-  }
-#endif
-
-  for (int y = 0; y < height - 1; y += 2) {
-    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
-    I42xToUYVYRow(src_y + src_stride_y, src_u, src_v,
-                  dst_frame + dst_stride_frame, width);
-    src_y += src_stride_y * 2;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
-  }
-  if (height & 1) {
-    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I420ToV210(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height) {
-  if (width * 16 / 6 > kMaxStride) {  // Row buffer of V210 is required.
-    return -1;
-  } else if (!src_y || !src_u || !src_v || !dst_frame ||
-      width <= 0 || height == 0) {
-    return -1;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
   }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
   }
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix);
-  UYVYToV210Row = UYVYToV210Row_C;
-
-  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
-      I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
-    I42xToUYVYRow = I42xToUYVYRow_SSE2;
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
-    I42xToUYVYRow(src_y, src_u, src_v, row, width);
-    UYVYToV210Row(row, dst_frame, width);
-    I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width);
-    UYVYToV210Row(row, dst_frame + dst_stride_frame, width);
-
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
     src_y += src_stride_y * 2;
     src_u += src_stride_u;
     src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
+    dst_uyvy += dst_stride_uyvy * 2;
   }
   if (height & 1) {
-    I42xToUYVYRow(src_y, src_u, src_v, row, width);
-    UYVYToV210Row(row, dst_frame, width);
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
   }
   return 0;
 }
 
-// Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+int I420ToNV12(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
                int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_argb ||
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+      int width) = MergeUVRow_C;
+  // Coalesce rows.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
   }
   return 0;
 }
 
-// Convert I420 to BGRA.
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+int I420ToNV21(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_bgra ||
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I422 to RGBA with matrix
+static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_rgba, int dst_stride_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
-    dst_stride_bgra = -dst_stride_bgra;
-  }
-  void (*I422ToBGRARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToBGRARow_C;
-#if defined(HAS_I422TOBGRAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToBGRARow = I422ToBGRARow_NEON;
-    }
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
   }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
-    dst_bgra += dst_stride_bgra;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToABGRRow_C;
-#if defined(HAS_I422TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I422ToABGRRow = I422ToABGRRow_NEON;
+      I422ToRGBARow = I422ToRGBARow_AVX2;
     }
   }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_NEON;
     }
   }
 #endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+    I422ToRGBARow = I422ToRGBARow_DSPR2;
+  }
+#endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
-    dst_abgr += dst_stride_abgr;
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -847,44 +527,81 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_rgba, int dst_stride_rgba,
                int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_rgba ||
+  return I420ToRGBAMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I420 to RGB24 with matrix
+static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
+                             const uint8* src_u, int src_stride_u,
+                             const uint8* src_v, int src_stride_v,
+                             uint8* dst_rgb24, int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width, int height) {
+  int y;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) = I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
   }
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGBARow_C;
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
+      I422ToRGB24Row = I422ToRGB24Row_AVX2;
     }
   }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
-    dst_rgba += dst_stride_rgba;
+  for (y = 0; y < height; ++y) {
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -901,44 +618,82 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
                 const uint8* src_v, int src_stride_v,
                 uint8* dst_rgb24, int dst_stride_rgb24,
                 int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_rgb24 ||
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_u, src_stride_u,
+                           src_v, src_stride_v,
+                           dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_v, src_stride_v,  // Swap U and V
+                           src_u, src_stride_u,
+                           dst_raw, dst_stride_raw,
+                           &kYvuI601Constants,  // Use Yvu matrix
+                           width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
-    dst_stride_rgb24 = -dst_stride_rgb24;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
   }
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width) = I422ToRGB24Row_C;
-#if defined(HAS_I422TORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I422ToRGB24Row = I422ToRGB24Row_NEON;
+      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
     }
   }
-#elif defined(HAS_I422TORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
-        I422ToRGB24Row = I422ToRGB24Row_SSSE3;
-      }
+      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
-    dst_rgb24 += dst_stride_rgb24;
+  for (y = 0; y < height; ++y) {
+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+                      width);
+    dst_argb1555 += dst_stride_argb1555;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -948,51 +703,60 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert I420 to RAW.
+
+// Convert I420 to ARGB4444.
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_raw ||
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
-    dst_stride_raw = -dst_stride_raw;
-  }
-  void (*I422ToRAWRow)(const uint8* y_buf,
-                       const uint8* u_buf,
-                       const uint8* v_buf,
-                       uint8* rgb_buf,
-                       int width) = I422ToRAWRow_C;
-#if defined(HAS_I422TORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRAWRow = I422ToRAWRow_Any_NEON;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I422ToRAWRow = I422ToRAWRow_NEON;
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
     }
   }
-#elif defined(HAS_I422TORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
-        I422ToRAWRow = I422ToRAWRow_SSSE3;
-      }
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
-    dst_raw += dst_stride_raw;
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+                      width);
+    dst_argb4444 += dst_stride_argb4444;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -1007,52 +771,53 @@ LIBYUV_API
 int I420ToRGB565(const uint8* src_y, int src_stride_y,
                  const uint8* src_u, int src_stride_u,
                  const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb, int dst_stride_rgb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_rgb ||
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
-    dst_stride_rgb = -dst_stride_rgb;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
+#if defined(HAS_I422TORGB565ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
   }
 #endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
     }
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToRGB565Row(row, dst_rgb, width);
-    dst_rgb += dst_stride_rgb;
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -1062,123 +827,115 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert I420 to ARGB1555.
+// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
   }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
+#if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB1555Row_C;
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
-    }
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToARGB1555Row(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
     }
   }
-  return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
+#endif
 #if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
   }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
   }
 #endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-     ARGBToARGB4444Row_C;
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
-    }
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
     }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToARGB4444Row(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate a row of argb.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
   return 0;
 }
 
@@ -1189,12 +946,13 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     const uint8* v, int v_stride,
                     uint8* dst_sample, int dst_sample_stride,
                     int width, int height,
-                    uint32 format) {
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
   if (!y || !u|| !v || !dst_sample ||
       width <= 0 || height == 0) {
     return -1;
   }
-  int r = 0;
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
@@ -1213,15 +971,6 @@ int ConvertFromI420(const uint8* y, int y_stride,
                      dst_sample_stride ? dst_sample_stride : width * 2,
                      width, height);
       break;
-    case FOURCC_V210:
-      r = I420ToV210(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride :
-                         (width + 47) / 48 * 128,
-                     width, height);
-      break;
     case FOURCC_RGBP:
       r = I420ToRGB565(y, y_stride,
                        u, u_stride,
@@ -1294,48 +1043,40 @@ int ConvertFromI420(const uint8* y, int y_stride,
                      dst_sample_stride ? dst_sample_stride : width * 4,
                      width, height);
       break;
-    case FOURCC_BGGR:
-      r = I420ToBayerBGGR(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_GBRG:
-      r = I420ToBayerGBRG(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_GRBG:
-      r = I420ToBayerGRBG(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_RGGB:
-      r = I420ToBayerRGGB(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
     case FOURCC_I400:
       r = I400Copy(y, y_stride,
                    dst_sample,
                    dst_sample_stride ? dst_sample_stride : width,
                    width, height);
       break;
+    case FOURCC_NV12: {
+      uint8* dst_uv = dst_sample + width * height;
+      r = I420ToNV12(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    case FOURCC_NV21: {
+      uint8* dst_vu = dst_sample + width * height;
+      r = I420ToNV21(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    // TODO(fbarchard): Add M420.
     // Triplanar formats
     // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
-    case FOURCC_YU12:
     case FOURCC_YV12: {
       int halfwidth = (width + 1) / 2;
       int halfheight = (height + 1) / 2;
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
new file mode 100644
index 00000000..2a8682b7
--- /dev/null
+++ b/files/source/convert_from_argb.cc
@@ -0,0 +1,1286 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int width) = ARGBToUV444Row_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+    if (TestCpuFlag(kCpuHasSSSE3)) {
+      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int width) = ARGBToUV411Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 4 == width &&
+      dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate a rows of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate a rows of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {
+  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return ARGBShuffle(src_argb, src_stride_argb,
+                     dst_rgba, dst_stride_rgba,
+                     (const uint8*)(&kShuffleMaskARGBToRGBA),
+                     width, height);
+}
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int y;
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRGB24Row_C;
+  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb24 = 0;
+  }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB24Row(src_argb, dst_rgb24, width);
+    src_argb += src_stride_argb;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  int y;
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRAWRow_C;
+  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_raw == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_raw = 0;
+  }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRAWRow = ARGBToRAWRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRAWRow(src_argb, dst_raw, width);
+    src_argb += src_stride_argb;
+    dst_raw += dst_stride_raw;
+  }
+  return 0;
+}
+
+// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRGB565Row_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb565 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb565 = 0;
+  }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565Row(src_argb, dst_rgb565, width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToARGB1555Row_C;
+  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb1555 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb1555 = 0;
+  }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+    src_argb += src_stride_argb;
+    dst_argb1555 += dst_stride_argb1555;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToARGB4444Row_C;
+  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb4444 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb4444 = 0;
+  }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+    src_argb += src_stride_argb;
+    dst_argb4444 += dst_stride_argb4444;
+  }
+  return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+    src_argb += src_stride_argb * 2;
+    dst_yj += dst_stride_yj * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to J422. (JPeg full range I422).
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
new file mode 100644
index 00000000..90f550a2
--- /dev/null
+++ b/files/source/convert_jpeg.cc
@@ -0,0 +1,393 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+  uint8* y;
+  int y_stride;
+  uint8* u;
+  int u_stride;
+  uint8* v;
+  int v_stride;
+  int w;
+  int h;
+};
+
+static void JpegCopyI420(void* opaque,
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I420Copy(data[0], strides[0],
+           data[1], strides[1],
+           data[2], strides[2],
+           dest->y, dest->y_stride,
+           dest->u, dest->u_stride,
+           dest->v, dest->v_stride,
+           dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I422ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I444ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I411ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I400ToI420(data[0], strides[0],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height) {
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret) {
+    *width = mjpeg_decoder.GetWidth();
+    *height = mjpeg_decoder.GetHeight();
+  }
+  mjpeg_decoder.UnloadFrame();
+  return ret ? 0 : -1;  // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+               size_t sample_size,
+               uint8* y, int y_stride,
+               uint8* u, int u_stride,
+               uint8* v, int v_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+  uint8* argb;
+  int argb_stride;
+  int w;
+  int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I420ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I422ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I444ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I411ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I400ToARGB(data[0], strides[0],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+               size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
new file mode 100644
index 00000000..aecdc80f
--- /dev/null
+++ b/files/source/convert_to_argb.cc
@@ -0,0 +1,305 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if destination crop_argb is same as source sample,
+  // also enable temporary buffer.
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+      crop_argb == sample;
+  uint8* dest_argb = crop_argb;
+  int dest_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * 4 * abs_crop_height;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width * 4;
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // Call NV12 but with u and v parameters swapped.
+      r = NV21ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J420: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      src_u = sample + src_width * abs_src_height +
+          (halfwidth * crop_y + crop_x) / 2;
+      src_v = sample + src_width * abs_src_height +
+          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToARGB(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToARGB(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     dest_argb, dest_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
new file mode 100644
index 00000000..e5f307c4
--- /dev/null
+++ b/files/source/convert_to_i420.cc
@@ -0,0 +1,337 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  // TODO(nisse): Why allow crop_height < 0?
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  const int inv_crop_height =
+      (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0  ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if destination y is same as source sample,
+  // also enable temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    y = rotate_buffer;
+    u = y + y_size;
+    v = u + uv_size;
+    y_stride = crop_width;
+    u_stride = v_stride = ((crop_width + 1) / 2);
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2,
+                       y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToI420(src, src_width * 3,
+                      y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToI420(src, src_width * 3,
+                    y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           u, u_stride,
+                           v, v_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with u and v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           v, v_stride,
+                           u, u_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420Rotate(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height, rotation);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index 2e96d9b9..84927ebc 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -4,21 +4,24 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/cpu_id.h"
 
-#ifdef _MSC_VER
-#include <intrin.h>  // For __cpuid()
+#if defined(_MSC_VER)
+#include <intrin.h>  // For __cpuidex()
 #endif
-#if !defined(__CLR_VER) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
+#if !defined(__native_client__)
 #include <stdlib.h>  // For getenv()
+#endif
 
 // For ArmCpuCaps() but unittested on all platforms
 #include <stdio.h>
@@ -26,186 +29,269 @@
 
 #include "libyuv/basic_types.h"  // For CPU_X86
 
-// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (  // NOLINT
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (  // NOLINT
-    "cpuid                                     \n"
-    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#endif
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// Low level cpuid for X86. Returns zeros on other CPUs.
-#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__))
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+    !defined(__clang__)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
-void CpuId(int cpu_info[4], int info_type) {
-  __cpuid(cpu_info, info_type);
-}
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else  // Visual C but not x86
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+  }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // defined(_MSC_VER)
+  uint32 info_ebx, info_edx;
+  asm volatile (
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
 #else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // defined(_MSC_VER)
+}
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
-void CpuId(int cpu_info[4], int) {
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
   cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif
 
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-#if !defined(__CLR_VER) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
-  return static_cast<uint32>(_xgetbv(xcr));
-}
-#elif !defined(__CLR_VER) && defined(_M_IX86)
+// For VS2010 and earlier emit can be used:
+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+//  __asm {
+//    xor        ecx, ecx    // xcr 0
+//    xgetbv
+//    mov        xcr0, eax
+//  }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
 #define HAS_XGETBV
-__declspec(naked) __declspec(align(16))
-static uint32 XGetBV(unsigned int xcr) {
-  __asm {
-    mov        ecx, [esp + 4]    // xcr
-    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // xgetbv for vs2005.
-    ret
-  }
-}
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int GetXCR0() {
+  uint32 xcr0 = 0u;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
 #elif defined(__i386__) || defined(__x86_64__)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
-  uint32 xcr_feature_mask;
-  asm volatile (  // NOLINT
-    ".byte 0x0f, 0x01, 0xd0\n"
-    : "=a"(xcr_feature_mask)
-    : "c"(xcr)
-    : "memory", "cc", "edx");  // edx unused.
-  return xcr_feature_mask;
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif  // defined(__i386__) || defined(__x86_64__)
+  return xcr0;
 }
-#endif
-#ifdef HAS_XGETBV
-static const int kXCR_XFEATURE_ENABLED_MASK = 0;
+#endif  // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
 #endif
 
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
 int ArmCpuCaps(const char* cpuinfo_name) {
-  int flags = 0;
-  FILE* fin = fopen(cpuinfo_name, "r");
-  if (fin) {
-    char buf[512];
-    while (fgets(buf, 511, fin)) {
-      if (memcmp(buf, "Features", 8) == 0) {
-        flags |= kCpuInitialized;
-        char* p = strstr(buf, " neon");
-        if (p && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= kCpuHasNEON;
-          break;
-        }
+  char cpuinfo_line[512];
+  FILE* f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+      char* p = strstr(cpuinfo_line, " neon");
+      if (p && (p[5] == ' ' || p[5] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
+      // aarch64 uses asimd for Neon.
+      p = strstr(cpuinfo_line, " asimd");
+      if (p && (p[6] == ' ' || p[6] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
       }
     }
-    fclose(fin);
   }
-  return flags;
+  fclose(f);
+  return 0;
 }
 
 // CPU detect function for SIMD instruction sets.
 LIBYUV_API
-int cpu_info_ = 0;
+int cpu_info_ = 0;  // cpu_info is not initialized yet.
 
 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.
-static bool TestEnv(const char* name) {
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
   const char* var = getenv(name);
   if (var) {
     if (var[0] != '0') {
-      return true;
+      return LIBYUV_TRUE;
     }
   }
-  return false;
+  return LIBYUV_FALSE;
+}
+#else  // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+  return LIBYUV_FALSE;
 }
+#endif
 
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
 int InitCpuFlags(void) {
-#if !defined(__CLR_VER) && defined(CPU_X86)
-  int cpu_info[4];
-  __cpuid(cpu_info, 1);
-  cpu_info_ = ((cpu_info[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
-              ((cpu_info[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
-              ((cpu_info[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
-              ((cpu_info[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-              (((cpu_info[2] & 0x18000000) == 0x18000000) ? kCpuHasAVX : 0) |
-              kCpuInitialized | kCpuHasX86;
+  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+  int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
+  CpuId(1, 0, cpu_info1);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+             kCpuHasX86;
+
 #ifdef HAS_XGETBV
-  if (cpu_info_ & kCpuHasAVX) {
-    __cpuid(cpu_info, 7);
-    if ((cpu_info[1] & 0x00000020) &&
-        ((XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06)) {
-      cpu_info_ |= kCpuHasAVX2;
+  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
+    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+
+    // Detect AVX512bw
+    if ((GetXCR0() & 0xe0) == 0xe0) {
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
     }
   }
 #endif
-  // environment variable overrides for testing.
+
+  // Environment variable overrides for testing.
   if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info_ &= ~kCpuHasX86;
+    cpu_info &= ~kCpuHasX86;
   }
   if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info_ &= ~kCpuHasSSE2;
+    cpu_info &= ~kCpuHasSSE2;
   }
   if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info_ &= ~kCpuHasSSSE3;
+    cpu_info &= ~kCpuHasSSSE3;
   }
   if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info_ &= ~kCpuHasSSE41;
+    cpu_info &= ~kCpuHasSSE41;
   }
   if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info_ &= ~kCpuHasSSE42;
+    cpu_info &= ~kCpuHasSSE42;
   }
   if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info_ &= ~kCpuHasAVX;
+    cpu_info &= ~kCpuHasAVX;
   }
   if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info_ &= ~kCpuHasAVX2;
+    cpu_info &= ~kCpuHasAVX2;
   }
-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info_ = kCpuInitialized;
-  }
-#elif defined(__arm__)
-#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-  // linux arm parse text file for neon detect.
-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
-#elif defined(__ARM_NEON__)
-  // gcc -mfpu=neon defines __ARM_NEON__
-  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
-  // to disable Neon on devices that do not have it.
-  cpu_info_ = kCpuHasNEON;
-#endif
-  cpu_info_ |= kCpuInitialized | kCpuHasARM;
+  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+    cpu_info &= ~kCpuHasERMS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+    cpu_info &= ~kCpuHasFMA3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
+    cpu_info &= ~kCpuHasAVX3;
+  }
+#endif
+#if defined(__mips__) && defined(__linux__)
+#if defined(__mips_dspr2)
+  cpu_info |= kCpuHasDSPR2;
+#endif
+  cpu_info |= kCpuHasMIPS;
+  if (getenv("LIBYUV_DISABLE_DSPR2")) {
+    cpu_info &= ~kCpuHasDSPR2;
+  }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+  cpu_info = kCpuHasNEON;
+#else
+  // Linux arm parse text file for neon detect.
+  cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info |= kCpuHasARM;
   if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info_ &= ~kCpuHasNEON;
+    cpu_info &= ~kCpuHasNEON;
   }
+#endif  // __arm__
   if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info_ = kCpuInitialized;
+    cpu_info = 0;
   }
-#endif  // __arm__
-  return cpu_info_;
+  cpu_info  |= kCpuInitialized;
+  cpu_info_ = cpu_info;
+  return cpu_info;
 }
 
+// Note that use of this function is not thread safe.
 LIBYUV_API
 void MaskCpuFlags(int enable_flags) {
-  InitCpuFlags();
-  cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
+  cpu_info_ = InitCpuFlags() & enable_flags;
 }
 
 #ifdef __cplusplus
diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc
deleted file mode 100644
index ed12de88..00000000
--- a/files/source/format_conversion.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
-// and vst would select which 2 components to write. The low level would need
-// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked) __declspec(align(16))
-static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
-                                 uint8* dst_bayer, uint32 selector, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_bayer
-    movd       xmm5, [esp + 12]  // selector
-    mov        ecx, [esp + 16]   // pix
-    pshufd     xmm5, xmm5, 0
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    lea        eax, [eax + 16]
-    pshufb     xmm0, xmm5
-    sub        ecx, 4
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jg         wloop
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-#define HAS_ARGBTOBAYERROW_SSSE3
-static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                                 uint32 selector, int pix) {
-  asm volatile (
-    "movd   %3,%%xmm5                          \n"
-    "pshufd $0x0,%%xmm5,%%xmm5                 \n"
-    ".p2align  4                               \n"
-"1:                                            \n"
-    "movdqa (%0),%%xmm0                        \n"
-    "lea    0x10(%0),%0                        \n"
-    "pshufb %%xmm5,%%xmm0                      \n"
-    "sub    $0x4,%2                            \n"
-    "movd   %%xmm0,(%1)                        \n"
-    "lea    0x4(%1),%1                         \n"
-    "jg     1b                                 \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_bayer), // %1
-    "+r"(pix)        // %2
-  : "g"(selector)    // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-
-);
-}
-#endif
-
-static void ARGBToBayerRow_C(const uint8* src_argb,
-                             uint8* dst_bayer, uint32 selector, int pix) {
-  int index0 = selector & 0xff;
-  int index1 = (selector >> 8) & 0xff;
-  // Copy a row of Bayer.
-  for (int x = 0; x < pix - 1; x += 2) {
-    dst_bayer[0] = src_argb[index0];
-    dst_bayer[1] = src_argb[index1];
-    src_argb += 8;
-    dst_bayer += 2;
-  }
-  if (pix & 1) {
-    dst_bayer[0] = src_argb[index0];
-  }
-}
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
-  return static_cast<uint32>(select0) |
-         static_cast<uint32>((select1 + 4) << 8) |
-         static_cast<uint32>((select0 + 8) << 16) |
-         static_cast<uint32>((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
-                         const int green_index,
-                         const int red_index,
-                         uint32 dst_fourcc_bayer,
-                         uint32 *index_map) {
-  // Now build a lookup table containing the indices for the four pixels in each
-  // 2x2 Bayer grid.
-  switch (dst_fourcc_bayer) {
-    case FOURCC_BGGR:
-      index_map[0] = GenerateSelector(blue_index, green_index);
-      index_map[1] = GenerateSelector(green_index, red_index);
-      break;
-    case FOURCC_GBRG:
-      index_map[0] = GenerateSelector(green_index, blue_index);
-      index_map[1] = GenerateSelector(red_index, green_index);
-      break;
-    case FOURCC_RGGB:
-      index_map[0] = GenerateSelector(red_index, green_index);
-      index_map[1] = GenerateSelector(green_index, blue_index);
-      break;
-    case FOURCC_GRBG:
-      index_map[0] = GenerateSelector(green_index, red_index);
-      index_map[1] = GenerateSelector(blue_index, green_index);
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-  return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-  }
-#endif
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-
-  for (int y = 0; y < height; ++y) {
-    ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
-    src_argb += src_stride_argb;
-    dst_bayer += dst_stride_bayer;
-  }
-  return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 r = src_bayer1[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer0[0];
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = AVG(r, src_bayer1[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    r = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer0[0];
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = AVG(r, src_bayer1[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[0];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 b = src_bayer1[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer1[1]);
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = src_bayer0[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    b = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer1[1]);
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = src_bayer0[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer0[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 b = src_bayer0[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer0[1]);
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = src_bayer1[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[7] = 255U;
-    b = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer0[1]);
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = src_bayer1[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer1[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 r = src_bayer0[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer1[0];
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = AVG(r, src_bayer0[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-    r = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer1[0];
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = AVG(r, src_bayer0[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[0];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;    // Bad FourCC
-  }
-
-  for (int y = 0; y < height - 1; y += 2) {
-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-              dst_argb + dst_stride_argb, width);
-    src_bayer += src_stride_bayer * 2;
-    dst_argb += dst_stride_argb * 2;
-  }
-  if (height & 1) {
-    BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width);
-  }
-  return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  if (width * 4 > kMaxStride) {
-    return -1;  // Size too large for row buffer
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    int halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    ARGBToYRow = ARGBToYRow_SSSE3;
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
-    ARGBToUVRow = ARGBToUVRow_SSSE3;
-  }
-#endif
-
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-
-  for (int y = 0; y < height - 1; y += 2) {
-    BayerRow0(src_bayer, src_stride_bayer, row, width);
-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-              row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    src_bayer += src_stride_bayer * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    BayerRow0(src_bayer, src_stride_bayer, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    int halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
-    ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-  }
-#endif
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
-    dst_bayer += dst_stride_bayer;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER)                                                 \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_y, int dst_stride_y,                       \
-                         uint8* dst_u, int dst_stride_u,                       \
-                         uint8* dst_v, int dst_stride_v,                       \
-                         int width, int height) {                              \
-  return BayerToI420(src_bayer, src_stride_bayer,                              \
-                     dst_y, dst_stride_y,                                      \
-                     dst_u, dst_stride_u,                                      \
-                     dst_v, dst_stride_v,                                      \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y,                   \
-                       const uint8* src_u, int src_stride_u,                   \
-                       const uint8* src_v, int src_stride_v,                   \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return I420ToBayer(src_y, src_stride_y,                                      \
-                     src_u, src_stride_u,                                      \
-                     src_v, src_stride_v,                                      \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb,             \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return ARGBToBayer(src_argb, src_stride_argb,                                \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_argb, int dst_stride_argb,                 \
-                         int width, int height) {                              \
-  return BayerToARGB(src_bayer, src_stride_bayer,                              \
-                     dst_argb, dst_stride_argb,                                \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index aa603947..50818418 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -4,28 +4,41 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/mjpeg_decoder.h"
 
 #ifdef HAVE_JPEG
-// Must be included before jpeglib
 #include <assert.h>
-#ifndef __CLR_VER
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
 #include <setjmp.h>
 #define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
 #endif
-#include <stdio.h>
-#include <stdlib.h>
+struct FILE;  // For jpeglib.h.
 
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
 extern "C" {
+#endif
+
 #include <jpeglib.h>
-}
 
-#include <climits>
-#include <cstring>
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
 
 namespace libyuv {
 
@@ -43,8 +56,15 @@ const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
 const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
 const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
 
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
 MJpegDecoder::MJpegDecoder()
-    : has_scanline_padding_(false),
+    : has_scanline_padding_(LIBYUV_FALSE),
       num_outbufs_(0),
       scanlines_(NULL),
       scanlines_sizes_(NULL),
@@ -80,41 +100,9 @@ MJpegDecoder::~MJpegDecoder() {
   DestroyOutputBuffers();
 }
 
-// Helper function to validate the jpeg looks ok.
-// TODO(fbarchard): Improve performance. Scan backward for EOI?
-bool ValidateJpeg(const uint8* sample, size_t sample_size) {
-  if (sample_size < 64) {
-    // ERROR: Invalid jpeg size: sample_size
-    return false;
-  }
-  if (sample[0] != 0xff || sample[1] != 0xd8) {
-    // ERROR: Invalid jpeg initial start code
-    return false;
-  }
-  bool soi = true;
-  int total_eoi = 0;
-  for (int i = 2; i < static_cast<int>(sample_size) - 1; ++i) {
-    if (sample[i] == 0xff) {
-      if (sample[i + 1] == 0xd8) {  // Start Of Image
-        soi = true;
-      } else if (sample[i + 1] == 0xd9) {  // End Of Image
-        if (soi) {
-          ++total_eoi;
-        }
-        soi = false;
-      }
-    }
-  }
-  if (!total_eoi) {
-    // ERROR: Invalid jpeg end code not found. Size sample_size
-    return false;
-  }
-  return true;
-}
-
-bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
   if (!ValidateJpeg(src, src_len)) {
-    return false;
+    return LIBYUV_FALSE;
   }
 
   buf_.data = src;
@@ -125,12 +113,12 @@ bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
   if (setjmp(error_mgr_->setjmp_buffer)) {
     // We called jpeg_read_header, it experienced an error, and we called
     // longjmp() and rewound the stack to here. Return error.
-    return false;
+    return LIBYUV_FALSE;
   }
 #endif
   if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
     // ERROR: Bad MJPEG header
-    return false;
+    return LIBYUV_FALSE;
   }
   AllocOutputBuffers(GetNumComponents());
   for (int i = 0; i < num_outbufs_; ++i) {
@@ -160,10 +148,10 @@ bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
     }
 
     if (GetComponentStride(i) != GetComponentWidth(i)) {
-      has_scanline_padding_ = true;
+      has_scanline_padding_ = LIBYUV_TRUE;
     }
   }
-  return true;
+  return LIBYUV_TRUE;
 }
 
 static int DivideAndRoundUp(int numerator, int denominator) {
@@ -242,45 +230,36 @@ int MJpegDecoder::GetComponentSize(int component) {
   return GetComponentWidth(component) * GetComponentHeight(component);
 }
 
-bool MJpegDecoder::UnloadFrame() {
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
 #ifdef HAVE_SETJMP
   if (setjmp(error_mgr_->setjmp_buffer)) {
     // We called jpeg_abort_decompress, it experienced an error, and we called
     // longjmp() and rewound the stack to here. Return error.
-    return false;
+    return LIBYUV_FALSE;
   }
 #endif
   jpeg_abort_decompress(decompress_struct_);
-  return true;
-}
-
-static void CopyRows(uint8* source, int source_stride,
-                     uint8* dest, int pixels, int numrows) {
-  for (int i = 0; i < numrows; ++i) {
-    memcpy(dest, source, pixels);
-    dest += pixels;
-    source += source_stride;
-  }
+  return LIBYUV_TRUE;
 }
 
 // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-bool MJpegDecoder::DecodeToBuffers(
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
     uint8** planes, int dst_width, int dst_height) {
   if (dst_width != GetWidth() ||
       dst_height > GetHeight()) {
     // ERROR: Bad dimensions
-    return false;
+    return LIBYUV_FALSE;
   }
 #ifdef HAVE_SETJMP
   if (setjmp(error_mgr_->setjmp_buffer)) {
     // We called into jpeglib, it experienced an error sometime during this
     // function call, and we called longjmp() and rewound the stack to here.
     // Return error.
-    return false;
+    return LIBYUV_FALSE;
   }
 #endif
   if (!StartDecode()) {
-    return false;
+    return LIBYUV_FALSE;
   }
   SetScanlinePointers(databuf_);
   int lines_left = dst_height;
@@ -294,7 +273,7 @@ bool MJpegDecoder::DecodeToBuffers(
     while (skip >= GetImageScanlinesPerImcuRow()) {
       if (!DecodeImcuRow()) {
         FinishDecode();
-        return false;
+        return LIBYUV_FALSE;
       }
       skip -= GetImageScanlinesPerImcuRow();
     }
@@ -303,7 +282,7 @@ bool MJpegDecoder::DecodeToBuffers(
       // copy the parts we want into the destination.
       if (!DecodeImcuRow()) {
         FinishDecode();
-        return false;
+        return LIBYUV_FALSE;
       }
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
@@ -313,8 +292,9 @@ bool MJpegDecoder::DecodeToBuffers(
         int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
                                 rows_to_skip;
         int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyRows(databuf_[i] + data_to_skip, GetComponentStride(i),
-                 planes[i], GetComponentWidth(i), scanlines_to_copy);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+                  planes[i], GetComponentWidth(i),
+                  GetComponentWidth(i), scanlines_to_copy);
         planes[i] += scanlines_to_copy * GetComponentWidth(i);
       }
       lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -326,12 +306,13 @@ bool MJpegDecoder::DecodeToBuffers(
          lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
-      return false;
+      return LIBYUV_FALSE;
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyRows(databuf_[i], GetComponentStride(i),
-               planes[i], GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -340,36 +321,37 @@ bool MJpegDecoder::DecodeToBuffers(
     // Have a partial iMCU row left over to decode.
     if (!DecodeImcuRow()) {
       FinishDecode();
-      return false;
+      return LIBYUV_FALSE;
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy =
           DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyRows(databuf_[i], GetComponentStride(i),
-               planes[i], GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
   return FinishDecode();
 }
 
-bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
     int dst_width, int dst_height) {
   if (dst_width != GetWidth() ||
       dst_height > GetHeight()) {
     // ERROR: Bad dimensions
-    return false;
+    return LIBYUV_FALSE;
   }
 #ifdef HAVE_SETJMP
   if (setjmp(error_mgr_->setjmp_buffer)) {
     // We called into jpeglib, it experienced an error sometime during this
     // function call, and we called longjmp() and rewound the stack to here.
     // Return error.
-    return false;
+    return LIBYUV_FALSE;
   }
 #endif
   if (!StartDecode()) {
-    return false;
+    return LIBYUV_FALSE;
   }
   SetScanlinePointers(databuf_);
   int lines_left = dst_height;
@@ -379,7 +361,7 @@ bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
     while (skip >= GetImageScanlinesPerImcuRow()) {
       if (!DecodeImcuRow()) {
         FinishDecode();
-        return false;
+        return LIBYUV_FALSE;
       }
       skip -= GetImageScanlinesPerImcuRow();
     }
@@ -387,7 +369,7 @@ bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
       // Have a partial iMCU row left over to skip.
       if (!DecodeImcuRow()) {
         FinishDecode();
-        return false;
+        return LIBYUV_FALSE;
       }
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
@@ -414,7 +396,7 @@ bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
          lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
-      return false;
+      return LIBYUV_FALSE;
     }
     (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
   }
@@ -422,19 +404,19 @@ bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
     // Have a partial iMCU row left over to decode.
     if (!DecodeImcuRow()) {
       FinishDecode();
-      return false;
+      return LIBYUV_FALSE;
     }
     (*fn)(opaque, databuf_, databuf_strides_, lines_left);
   }
   return FinishDecode();
 }
 
-void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+void init_source(j_decompress_ptr cinfo) {
   fill_input_buffer(cinfo);
 }
 
-boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data);
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
   if (buf_vec->pos >= buf_vec->len) {
     assert(0 && "No more data");
     // ERROR: No more data
@@ -446,26 +428,28 @@ boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
   return TRUE;
 }
 
-void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
-                                   long num_bytes) {  // NOLINT
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
   cinfo->src->next_input_byte += num_bytes;
 }
 
-void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+void term_source(j_decompress_ptr cinfo) {
   // Nothing to do.
 }
 
 #ifdef HAVE_SETJMP
-void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+void ErrorHandler(j_common_ptr cinfo) {
   // This is called when a jpeglib command experiences an error. Unfortunately
   // jpeglib's error handling model is not very flexible, because it expects the
   // error handler to not return--i.e., it wants the program to terminate. To
   // recover from errors we use setjmp() as shown in their example. setjmp() is
   // C's implementation for the "call with current continuation" functionality
   // seen in some functional programming languages.
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
   char buf[JMSG_LENGTH_MAX];
   (*cinfo->err->format_message)(cinfo, buf);
   // ERROR: Error in jpeglib: buf
+#endif
 
   SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
   // This rewinds the call stack to the point of the corresponding setjmp()
@@ -514,26 +498,29 @@ void MJpegDecoder::DestroyOutputBuffers() {
 }
 
 // JDCT_IFAST and do_block_smoothing improve performance substantially.
-bool MJpegDecoder::StartDecode() {
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
   decompress_struct_->raw_data_out = TRUE;
   decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
   decompress_struct_->dither_mode = JDITHER_NONE;
-  decompress_struct_->do_fancy_upsampling = false;  // Not applicable to 'raw'
-  decompress_struct_->enable_2pass_quant = false;  // Only for buffered mode
-  decompress_struct_->do_block_smoothing = false;  // blocky but fast
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
 
   if (!jpeg_start_decompress(decompress_struct_)) {
     // ERROR: Couldn't start JPEG decompressor";
-    return false;
+    return LIBYUV_FALSE;
   }
-  return true;
+  return LIBYUV_TRUE;
 }
 
-bool MJpegDecoder::FinishDecode() {
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
   // jpeglib considers it an error if we finish without decoding the whole
   // image, so we call "abort" rather than "finish".
   jpeg_abort_decompress(decompress_struct_);
-  return true;
+  return LIBYUV_TRUE;
 }
 
 void MJpegDecoder::SetScanlinePointers(uint8** data) {
@@ -546,8 +533,8 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
   }
 }
 
-inline bool MJpegDecoder::DecodeImcuRow() {
-  return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) ==
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
       jpeg_read_raw_data(decompress_struct_,
                          scanlines_,
                          GetImageScanlinesPerImcuRow());
diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc
new file mode 100644
index 00000000..9c488320
--- /dev/null
+++ b/files/source/mjpeg_validate.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  if (sample_size >= 2) {
+    const uint8* end = sample + sample_size - 1;
+    const uint8* it = sample;
+    while (it < end) {
+      // TODO(fbarchard): scan for 0xd9 instead.
+      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      if (it == NULL) {
+        break;
+      }
+      if (it[1] == 0xd9) {
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      ++it;  // Skip over current 0xff.
+    }
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
+  const size_t kBackSearchSize = 1024;
+  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(sample + 2, sample_size - 2);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index a7f5086a..237ab683 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -17,6 +17,7 @@
 #include "libyuv/mjpeg_decoder.h"
 #endif
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"  // for ScaleRowDown2
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -28,38 +29,180 @@ LIBYUV_API
 void CopyPlane(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height) {
+  int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
   }
 #endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    CopyRow = CopyRow_SSE2;
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height) {
+  int y;
+  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_16_MIPS;
   }
 #endif
 
   // Copy plane
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     CopyRow(src_y, dst_y, width);
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
 }
 
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+  return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
 // Convert I420 to I400.
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
-               uint8*, int,  // src_u
-               uint8*, int,  // src_v
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
                uint8* dst_y, int dst_stride_y,
                int width, int height) {
   if (!src_y || !dst_y || width <= 0 || height == 0) {
@@ -75,30 +218,53 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Mirror a plane of data
+// Mirror a plane of data.
 void MirrorPlane(const uint8* src_y, int src_stride_y,
                  uint8* dst_y, int dst_stride_y,
                  int width, int height) {
+  int y;
   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
 #if defined(HAS_MIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    MirrorRow = MirrorRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
-    MirrorRow = MirrorRow_SSE2;
 #if defined(HAS_MIRRORROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) &&
-        IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
       MirrorRow = MirrorRow_SSSE3;
     }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
 #endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+    MirrorRow = MirrorRow_DSPR2;
   }
 #endif
 
   // Mirror plane
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     MirrorRow(src_y, dst_y, width);
     src_y += src_stride_y;
     dst_y += dst_stride_y;
@@ -112,42 +278,52 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) =
+      YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+      YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-                     uint8* dst_y, int pix);
-  YUY2ToYRow = YUY2ToYRow_C;
-  YUY2ToUV422Row = YUY2ToUV422Row_C;
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
     }
   }
-#elif defined(HAS_YUY2TOYROW_NEON)
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     }
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
@@ -156,7 +332,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
     src_yuy2 += src_stride_yuy2;
@@ -174,42 +350,52 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) =
+      UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int width) = UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix);
-  UYVYToYRow = UYVYToYRow_C;
-  UYVYToUV422Row = UYVYToUV422Row_C;
+  // Coalesce rows.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
 #if defined(HAS_UYVYTOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUV422Row = UYVYToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUV422Row = UYVYToUV422Row_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
     }
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-      }
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     }
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
@@ -218,7 +404,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
     src_uyvy += src_stride_uyvy;
@@ -229,6 +415,26 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
 
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
 // Mirror I420 with optional flipping
 LIBYUV_API
 int I420Mirror(const uint8* src_y, int src_stride_y,
@@ -238,6 +444,8 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -245,7 +453,7 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
     src_u = src_u + (halfheight - 1) * src_stride_u;
     src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -254,8 +462,6 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
     src_stride_v = -src_stride_v;
   }
 
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   if (dst_y) {
     MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
@@ -269,6 +475,9 @@ LIBYUV_API
 int ARGBMirror(const uint8* src_argb, int src_stride_argb,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -278,19 +487,33 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
-      ARGBMirrorRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
   }
 #endif
 
   // Mirror plane
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBMirrorRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
@@ -298,7 +521,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Get a blender that optimized for the CPU, alignment and pixel count.
+// Get a blender that optimized for the CPU and pixel count.
 // As there are 6 blenders to choose from, the caller should try to use
 // the same blend function for all pixels if possible.
 LIBYUV_API
@@ -311,9 +534,9 @@ ARGBBlendRow GetARGBBlend() {
     return ARGBBlendRow;
   }
 #endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBBlendRow = ARGBBlendRow_NEON;
   }
 #endif
   return ARGBBlendRow;
@@ -325,6 +548,9 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               const uint8* src_argb1, int src_stride_argb1,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
+  int y;
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = GetARGBBlend();
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -334,10 +560,16 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = GetARGBBlend();
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
     src_argb0 += src_stride_argb0;
     src_argb1 += src_stride_argb1;
@@ -346,488 +578,478 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
-// Convert ARGB to I400.
+// Alpha Blend plane and store to destination.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
                uint8* dst_y, int dst_stride_y,
                int width, int height) {
-  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+  int y;
+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    ARGBToYRow = ARGBToYRow_SSSE3;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
   }
-#endif
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
+  // Coalesce rows for Y plane.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      alpha_stride == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
   }
-  return 0;
-}
 
-// ARGB little endian (bgra in memory) to I422
-// same as I420 except UV plane is full height
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
+  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
     }
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
+  for (y = 0; y < height; ++y) {
+    BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
+    src_y0 += src_stride_y0;
+    src_y1 += src_stride_y1;
+    alpha += alpha_stride;
     dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
   }
   return 0;
 }
 
-// Convert I422 to BGRA.
+#define MAXTWIDTH 2048
+// Alpha Blend YUV images and store to destination.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_bgra ||
-      width <= 0 || height == 0) {
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+  // Half width/height for UV.
+  int halfwidth = (width + 1) >> 1;
+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
+
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
-    dst_stride_bgra = -dst_stride_bgra;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
   }
-  void (*I422ToBGRARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToBGRARow_C;
-#if defined(HAS_I422TOBGRAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToBGRARow = I422ToBGRARow_NEON;
+
+  // Blend Y plane.
+  BlendPlane(src_y0, src_stride_y0,
+             src_y1, src_stride_y1,
+             alpha, alpha_stride,
+             dst_y, dst_stride_y,
+             width, height);
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
     }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
-    dst_bgra += dst_stride_bgra;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
+  if (!IS_ALIGNED(width, 2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
   }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToABGRRow_C;
-#if defined(HAS_I422TOABGRROW_NEON)
+#if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToABGRRow = I422ToABGRRow_NEON;
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_NEON;
+      }
     }
   }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+      if (IS_ALIGNED(halfwidth, 32)) {
+        ScaleRowDown2 = ScaleRowDown2Box_AVX2;
       }
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
-    dst_abgr += dst_stride_abgr;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, halfwidth);
+  for (y = 0; y < height; y += 2) {
+    // last row of odd height image use 1 row of alpha instead of 2.
+    if (y == (height - 1)) {
+      alpha_stride = 0;
+    }
+    // Subsample 2 rows of UV to half width and half height.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
   }
+  free_aligned_buffer_64(halfalpha);
   return 0;
 }
 
-// Convert I422 to RGBA.
+// Multiply 2 ARGB images and store to destination.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_rgba ||
-      width <= 0 || height == 0) {
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBMultiplyRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
   }
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGBARow_C;
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
     }
   }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
-    dst_rgba += dst_stride_rgba;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
+  // Multiply plane
+  for (y = 0; y < height; ++y) {
+    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
-// Convert ARGB to RGBA.
+// Add 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  if (!src_argb || !dst_rgba ||
-      width <= 0 || height == 0) {
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height) {
+  int y;
+  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                     int width) = ARGBAddRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
   }
-  void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) =
-      ARGBToRGBARow_C;
-#if defined(HAS_ARGBTORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-    ARGBToRGBARow = ARGBToRGBARow_SSSE3;
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
-#endif
-#if defined(HAS_ARGBTORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBToRGBARow = ARGBToRGBARow_NEON;
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAddRow = ARGBAddRow_SSE2;
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBToRGBARow(src_argb, dst_rgba, width);
-    src_argb += src_stride_argb;
-    dst_rgba += dst_stride_rgba;
-  }
-  return 0;
-}
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB24Row_C;
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAddRow = ARGBAddRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAddRow = ARGBAddRow_SSE2;
     }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAddRow = ARGBAddRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
+#if defined(HAS_ARGBADDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
-    }
+    ARGBAddRow = ARGBAddRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+      ARGBAddRow = ARGBAddRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToRGB24Row(src_argb, dst_rgb24, width);
-    src_argb += src_stride_argb;
-    dst_rgb24 += dst_stride_rgb24;
+  // Add plane
+  for (y = 0; y < height; ++y) {
+    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
-// Convert ARGB To RAW.
+// Subtract 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
   }
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRAWRow_C;
-#if defined(HAS_ARGBTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSubtractRow = ARGBSubtractRow_SSE2;
     }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
-      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTORAWROW_NEON)
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
-    }
+    ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      ARGBToRAWRow = ARGBToRAWRow_NEON;
+      ARGBSubtractRow = ARGBSubtractRow_NEON;
     }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToRAWRow(src_argb, dst_raw, width);
-    src_argb += src_stride_argb;
-    dst_raw += dst_stride_raw;
+  // Subtract plane
+  for (y = 0; y < height; ++y) {
+    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+// Convert I422 to RGBA with matrix
+static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_rgba, int dst_stride_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba ||
+      width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
   }
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
-    }
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
     }
   }
 #endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBToRGB565Row(src_argb, dst_rgb565, width);
-    src_argb += src_stride_argb;
-    dst_rgb565 += dst_stride_rgb565;
-  }
-  return 0;
-}
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
-  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB1555Row_C;
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
     }
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
     }
   }
 #endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+    I422ToRGBARow = I422ToRGBARow_DSPR2;
+  }
+#endif
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
-    src_argb += src_stride_argb;
-    dst_argb1555 += dst_stride_argb1555;
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
   }
   return 0;
 }
 
-// Convert ARGB To ARGB4444.
+// Convert I422 to RGBA.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
-  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB4444Row_C;
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    if (width * 2 <= kMaxStride) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
-    }
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
-    }
-  }
-#endif
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants,
+                          width, height);
+}
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
-    src_argb += src_stride_argb;
-    dst_argb4444 += dst_stride_argb4444;
-  }
-  return 0;
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
 }
 
 // Convert NV12 to RGB565.
-// TODO(fbarchard): (Re) Optimize for Neon.
 LIBYUV_API
 int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                  const uint8* src_uv, int src_stride_uv,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height) {
-  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+  int y;
+  void (*NV12ToRGB565Row)(const uint8* y_buf,
+                          const uint8* uv_buf,
+                          uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 ||
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -836,33 +1058,33 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
     dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
-#if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
-    NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+    }
   }
 #endif
-#if defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
-    NV12ToARGBRow = NV12ToARGBRow_NEON;
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+    }
   }
 #endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, row, width);
-    ARGBToRGB565Row(row, dst_rgb565, width);
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
     dst_rgb565 += dst_stride_rgb565;
     src_y += src_stride_y;
     if (y & 1) {
@@ -872,48 +1094,52 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert NV21 to RGB565.
+// Convert RAW to RGB24.
 LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_vu, int src_stride_vu,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) {
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+               uint8* dst_rgb24, int dst_stride_rgb24,
+               int width, int height) {
+  int y;
+  void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+      RAWToRGB24Row_C;
+  if (!src_raw || !dst_rgb24 ||
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
   }
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV21ToARGBRow_C;
-#if defined(HAS_NV21TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
-    NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 &&
+      dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_rgb24 = 0;
+  }
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToRGB24Row = RAWToRGB24Row_SSSE3;
+    }
   }
 #endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+#if defined(HAS_RAWTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToRGB24Row = RAWToRGB24Row_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_vu, row, width);
-    ARGBToRGB565Row(row, dst_rgb565, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_vu += src_stride_vu;
-    }
+  for (y = 0; y < height; ++y) {
+    RAWToRGB24Row(src_raw, dst_rgb24, width);
+    src_raw += src_stride_raw;
+    dst_rgb24 += dst_stride_rgb24;
   }
   return 0;
 }
@@ -922,24 +1148,44 @@ LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
               uint32 value) {
-  void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C;
+  int y;
+  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
 #if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    SetRow = SetRow8_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SetRow = SetRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SetRow = SetRow_NEON;
+    }
   }
 #endif
 #if defined(HAS_SETROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    SetRow = SetRow8_X86;
+  if (TestCpuFlag(kCpuHasX86)) {
+    SetRow = SetRow_Any_X86;
+    if (IS_ALIGNED(width, 4)) {
+      SetRow = SetRow_X86;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    SetRow = SetRow_ERMS;
   }
 #endif
 
-  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
   // Set plane
-  for (int y = 0; y < height; ++y) {
-    SetRow(dst_y, v32, width);
+  for (y = 0; y < height; ++y) {
+    SetRow(dst_y, value, width);
     dst_y += dst_stride_y;
   }
 }
@@ -952,19 +1198,19 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
              int x, int y,
              int width, int height,
              int value_y, int value_u, int value_v) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  uint8* start_y = dst_y + y * dst_stride_y + x;
+  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
   if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height <= 0 ||
+      width <= 0 || height == 0 ||
       x < 0 || y < 0 ||
       value_y < 0 || value_y > 255 ||
       value_u < 0 || value_u > 255 ||
       value_v < 0 || value_v > 255) {
     return -1;
   }
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  uint8* start_y = dst_y + y * dst_stride_y + x;
-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
 
   SetPlane(start_y, dst_stride_y, width, height, value_y);
   SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
@@ -978,26 +1224,45 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
              int dst_x, int dst_y,
              int width, int height,
              uint32 value) {
+  int y;
+  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
   if (!dst_argb ||
-      width <= 0 || height <= 0 ||
+      width <= 0 || height == 0 ||
       dst_x < 0 || dst_y < 0) {
     return -1;
   }
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-#if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    SetRows32_NEON(dst, value, width, dst_stride_argb, height);
-    return 0;
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
   }
 #endif
-#if defined(HAS_SETROW_X86)
+#if defined(HAS_ARGBSETROW_X86)
   if (TestCpuFlag(kCpuHasX86)) {
-    SetRows32_X86(dst, value, width, dst_stride_argb, height);
-    return 0;
+    ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
-  SetRows32_C(dst, value, width, dst_stride_argb, height);
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
   return 0;
 }
 
@@ -1018,6 +1283,9 @@ LIBYUV_API
 int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_argb, int dst_stride_argb,
                   int width, int height) {
+  int y;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1026,24 +1294,39 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                           int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
   }
-#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBAttenuateRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
@@ -1056,6 +1339,9 @@ LIBYUV_API
 int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height) {
+  int y;
+  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1064,17 +1350,32 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                             int width) = ARGBUnattenuateRow_C;
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+    }
   }
 #endif
+// TODO(fbarchard): Neon version.
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBUnattenuateRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
@@ -1087,6 +1388,9 @@ LIBYUV_API
 int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1095,17 +1399,25 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_SSSE3;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBGrayRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
@@ -1118,19 +1430,30 @@ LIBYUV_API
 int ARGBGray(uint8* dst_argb, int dst_stride_argb,
              int dst_x, int dst_y,
              int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_SSSE3;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
     ARGBGrayRow(dst, dst, width);
     dst += dst_stride_argb;
   }
@@ -1141,78 +1464,182 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
 LIBYUV_API
 int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
               int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBSEPIAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBSepiaRow = ARGBSepiaRow_SSSE3;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
     ARGBSepiaRow(dst, width);
     dst += dst_stride_argb;
   }
   return 0;
 }
 
-// Apply a 4x3 matrix rotation to each ARGB pixel.
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
                     const int8* matrix_argb,
-                    int dst_x, int dst_y, int width, int height) {
-  if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+                    int width, int height) {
+  int y;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
-  void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
-                             int width) = ARGBColorMatrixRow_C;
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  for (int y = 0; y < height; ++y) {
-    ARGBColorMatrixRow(dst, matrix_argb, width);
-    dst += dst_stride_argb;
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
 int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
                    const uint8* table_argb,
                    int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
       dst_x < 0 || dst_y < 0) {
     return -1;
   }
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
-                            int width) = ARGBColorTableRow_C;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBCOLORTABLEROW_X86)
   if (TestCpuFlag(kCpuHasX86)) {
     ARGBColorTableRow = ARGBColorTableRow_X86;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBColorTableRow(dst, table_argb, width);
     dst += dst_stride_argb;
   }
   return 0;
 }
 
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
 // ARGBQuantize is used to posterize art.
 // e.g. rgb / qvalue * qvalue + qvalue / 2
 // But the low levels implement efficiently with 3 parameters, and could be
 // used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
 // The divide is replaces with a multiply by reciprocal fixed point multiply.
 // Caveat - although SSE2 saturates, the C function does not and should be used
 // with care if doing anything but quantization.
@@ -1220,20 +1647,31 @@ LIBYUV_API
 int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
                  int scale, int interval_size, int interval_offset,
                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
       interval_size < 1 || interval_size > 255) {
     return -1;
   }
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
 #if defined(HAS_ARGBQUANTIZEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
     ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
     ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
     dst += dst_stride_argb;
   }
@@ -1246,19 +1684,20 @@ LIBYUV_API
 int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
                              int32* dst_cumsum, int dst_stride32_cumsum,
                              int width, int height) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  int32* previous_cumsum = dst_cumsum;
   if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
     return -1;
   }
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
   }
 #endif
   memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
-  int32* previous_cumsum = dst_cumsum;
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
     previous_cumsum = dst_cumsum;
     dst_cumsum += dst_stride32_cumsum;
@@ -1276,17 +1715,36 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int32* dst_cumsum, int dst_stride32_cumsum,
              int width, int height, int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+  int32* cumsum_bot_row;
+  int32* max_cumsum_bot_row;
+  int32* cumsum_top_row;
+
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
-#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (radius > height) {
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
-    CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
   }
 #endif
   // Compute enough CumulativeSum for first row to be blurred. After this
@@ -1296,16 +1754,18 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
                            width, radius);
 
   src_argb = src_argb + radius * src_stride_argb;
-  int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
 
-  const int32* max_cumsum_bot_row =
-      &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
-  const int32* cumsum_top_row = &dst_cumsum[0];
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
     int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
     int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;
+    int x;
+    int n;
 
     // Increment cumsum_top_row pointer with circular buffer wrap around.
     if (top_y) {
@@ -1328,27 +1788,25 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
     }
 
     // Left clipped.
-    int boxwidth = radius * 4;
-    int x;
     for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                                boxwidth, area, &dst_argb[x * 4], 1);
       area += (bot_y - top_y);
       boxwidth += 4;
     }
 
     // Middle unclipped.
-    int n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
-                           boxwidth, area, &dst_argb[x * 4], n);
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], n);
 
     // Right clipped.
     for (x += n; x <= width - 1; ++x) {
       area -= (bot_y - top_y);
       boxwidth -= 4;
-      CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
-                             cumsum_bot_row + (x - radius - 1) * 4,
-                             boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4,
+                                boxwidth, area, &dst_argb[x * 4], 1);
     }
     dst_argb += dst_stride_argb;
   }
@@ -1360,6 +1818,9 @@ LIBYUV_API
 int ARGBShade(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height, uint32 value) {
+  int y;
+  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+                       int width, uint32 value) = ARGBShadeRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
@@ -1368,17 +1829,25 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
-#if defined(HAS_ARGBSHADE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
     ARGBShadeRow = ARGBShadeRow_SSE2;
   }
 #endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBShadeRow = ARGBShadeRow_NEON;
+  }
+#endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBShadeRow(src_argb, dst_argb, width, value);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
@@ -1386,42 +1855,816 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Interpolate 2 planes by specified amount (0 to 255).
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+                     const uint8* src1, int src_stride1,
+                     uint8* dst, int dst_stride,
+                     int width, int height, int interpolation) {
+  int y;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst = dst + (height - 1) * dst_stride;
+    dst_stride = -dst_stride;
+  }
+  // Coalesce rows.
+  if (src_stride0 == width &&
+      src_stride1 == width &&
+      dst_stride == width) {
+    width *= height;
+    height = 1;
+    src_stride0 = src_stride1 = dst_stride = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
+      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
+      IS_ALIGNED(width, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    InterpolateRow(dst, src0, src1 - src0, width, interpolation);
+    src0 += src_stride0;
+    src1 += src_stride1;
+    dst += dst_stride;
+  }
+  return 0;
+}
+
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation) {
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+  return InterpolatePlane(src_argb0, src_stride_argb0,
+                          src_argb1, src_stride_argb1,
+                          dst_argb, dst_stride_argb,
+                          width * 4, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+                    const uint8* src0_u, int src0_stride_u,
+                    const uint8* src0_v, int src0_stride_v,
+                    const uint8* src1_y, int src1_stride_y,
+                    const uint8* src1_u, int src1_stride_u,
+                    const uint8* src1_v, int src1_stride_v,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height, int interpolation) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src0_y || !src0_u || !src0_v ||
+      !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  InterpolatePlane(src0_y, src0_stride_y,
+                   src1_y, src1_stride_y,
+                   dst_y, dst_stride_y,
+                   width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u,
+                   src1_u, src1_stride_u,
+                   dst_u, dst_stride_u,
+                   halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v,
+                   src1_v, src1_stride_v,
+                   dst_v, dst_stride_v,
+                   halfwidth, halfheight, interpolation);
+  return 0;
+}
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  int y;
+  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+                         const uint8* shuffler, int width) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb ||
+      width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
   }
-  void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) = ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
-      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
+  // Coalesce rows.
+  if (src_stride_bgra == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_bgra = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBShuffleRow = ARGBShuffleRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_NEON;
+    }
   }
 #endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Sobel ARGB effect.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_argb, int dst_stride_argb,
+                        int width, int height,
+                        void (*SobelRow)(const uint8* src_sobelx,
+                                         const uint8* src_sobely,
+                                         uint8* dst, int width)) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+      ARGBToYJRow_C;
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobely, int width) =
+      SobelXRow_C;
+  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
+  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+#if defined(HAS_SOBELYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelYRow = SobelYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXRow = SobelXRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
+#endif
+  {
+    // 3 rows with edges before/after.
+    const int kRowSize = (width + kEdge + 31) & ~31;
+    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+    uint8* row_sobelx = rows;
+    uint8* row_sobely = rows + kRowSize;
+    uint8* row_y = rows + kRowSize * 2;
+
+    // Convert first row.
+    uint8* row_y0 = row_y + kEdge;
+    uint8* row_y1 = row_y0 + kRowSize;
+    uint8* row_y2 = row_y1 + kRowSize;
+    ARGBToYJRow(src_argb, row_y0, width);
+    row_y0[-1] = row_y0[0];
+    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
+    ARGBToYJRow(src_argb, row_y1, width);
+    row_y1[-1] = row_y1[0];
+    memset(row_y1 + width, row_y1[width - 1], 16);
+    memset(row_y2 + width, 0, 16);
+
+    for (y = 0; y < height; ++y) {
+      // Convert next row of ARGB to G.
+      if (y < (height - 1)) {
+        src_argb += src_stride_argb;
+      }
+      ARGBToYJRow(src_argb, row_y2, width);
+      row_y2[-1] = row_y2[0];
+      row_y2[width] = row_y2[width - 1];
+
+      SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+      SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+      SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+      // Cycle thru circular queue of 3 row_y buffers.
+      {
+        uint8* row_yt = row_y0;
+        row_y0 = row_y1;
+        row_y1 = row_y2;
+        row_y2 = row_yt;
+      }
+
+      dst_argb += dst_stride_argb;
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelRow = SobelRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelRow = SobelRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      SobelRow = SobelRow_NEON;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height) {
+  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelToPlaneRow = SobelToPlaneRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SobelToPlaneRow = SobelToPlaneRow_NEON;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                      width, height, SobelToPlaneRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXYRow = SobelXYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXYRow = SobelXYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      SobelXYRow = SobelXYRow_NEON;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height) {
+  int y;
+  void (*ARGBPolynomialRow)(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) = ARGBPolynomialRow_C;
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+    ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+      IS_ALIGNED(width, 2)) {
+    ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma,
+                       int width, int height) {
+  int y;
+  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
+      int width, const uint8* luma, const uint32 lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+    ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int y;
+  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBCopyAlphaRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Extract just the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
+                     uint8* dst_a, int dst_stride,
+                     int width, int height) {
+  if (!src_argb || !dst_a || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  // Coalesce rows.
+  if (src_stride == width * 4 && dst_stride == width) {
+    width *= height;
+    height = 1;
+    src_stride = dst_stride = 0;
+  }
+  void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
+      ARGBExtractAlphaRow_C;
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+                                               : ARGBExtractAlphaRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+                                                : ARGBExtractAlphaRow_Any_NEON;
+  }
+#endif
+
   for (int y = 0; y < height; ++y) {
-    ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
-                       width, interpolation);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
+    ARGBExtractAlphaRow(src_argb, dst_a, width);
+    src_argb += src_stride;
+    dst_a += dst_stride;
+  }
+  return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height) {
+  int y;
+  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      ARGBCopyYToAlphaRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
     dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
+// TODO(fbarchard): Consider if width is even Y channel can be split
+// directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_yuy2 ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // row of y and 2 rows of uv
+    align_buffer_64(rows, awidth * 3);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_yuy2 += src_stride_yuy2 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split Y from UV.
+      SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+      memcpy(dst_y, rows, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // row of y and 2 rows of uv
+    align_buffer_64(rows, awidth * 3);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+      memcpy(dst_y, rows, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index cac3fa0b..01ea5c40 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -13,6 +13,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
 #include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -20,785 +21,46 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if !defined(YUV_DISABLE_ASM) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#if defined(__APPLE__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".private_extern _" #name "                \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#else
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-#name ":                                       \n"
-#endif
-#endif
-
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorRowUV_NEON(const uint8* src,
-                        uint8* dst_a, uint8* dst_b,
-                        int width);
-#define HAS_TRANSPOSE_WX8_NEON
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWX8_NEON
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int width);
-#endif  // defined(__ARM_NEON__)
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                               uint8* dst, int dst_stride, int width) {
-  __asm {
-    push      edi
-    push      esi
-    push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
-    mov       edx, [esp + 12 + 12]  // dst
-    mov       esi, [esp + 12 + 16]  // dst_stride
-    mov       ecx, [esp + 12 + 20]  // width
-
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    align      16
- convertloop:
-    movq      xmm0, qword ptr [eax]
-    lea       ebp, [eax + 8]
-    movq      xmm1, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm0, xmm1
-    movq      xmm2, qword ptr [eax]
-    movdqa    xmm1, xmm0
-    palignr   xmm1, xmm1, 8
-    movq      xmm3, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm2, xmm3
-    movdqa    xmm3, xmm2
-    movq      xmm4, qword ptr [eax]
-    palignr   xmm3, xmm3, 8
-    movq      xmm5, qword ptr [eax + edi]
-    punpcklbw xmm4, xmm5
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm5, xmm4
-    movq      xmm6, qword ptr [eax]
-    palignr   xmm5, xmm5, 8
-    movq      xmm7, qword ptr [eax + edi]
-    punpcklbw xmm6, xmm7
-    mov       eax, ebp
-    movdqa    xmm7, xmm6
-    palignr   xmm7, xmm7, 8
-    // Second round of bit swap.
-    punpcklwd xmm0, xmm2
-    punpcklwd xmm1, xmm3
-    movdqa    xmm2, xmm0
-    movdqa    xmm3, xmm1
-    palignr   xmm2, xmm2, 8
-    palignr   xmm3, xmm3, 8
-    punpcklwd xmm4, xmm6
-    punpcklwd xmm5, xmm7
-    movdqa    xmm6, xmm4
-    movdqa    xmm7, xmm5
-    palignr   xmm6, xmm6, 8
-    palignr   xmm7, xmm7, 8
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    punpckldq xmm0, xmm4
-    movq      qword ptr [edx], xmm0
-    movdqa    xmm4, xmm0
-    palignr   xmm4, xmm4, 8
-    movq      qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    punpckldq xmm2, xmm6
-    movdqa    xmm6, xmm2
-    palignr   xmm6, xmm6, 8
-    movq      qword ptr [edx], xmm2
-    punpckldq xmm1, xmm5
-    movq      qword ptr [edx + esi], xmm6
-    lea       edx, [edx + 2 * esi]
-    movdqa    xmm5, xmm1
-    movq      qword ptr [edx], xmm1
-    palignr   xmm5, xmm5, 8
-    punpckldq xmm3, xmm7
-    movq      qword ptr [edx + esi], xmm5
-    lea       edx, [edx + 2 * esi]
-    movq      qword ptr [edx], xmm3
-    movdqa    xmm7, xmm3
-    palignr   xmm7, xmm7, 8
-    sub       ecx, 8
-    movq      qword ptr [edx + esi], xmm7
-    lea       edx, [edx + 2 * esi]
-    jg        convertloop
-
-    pop       ebp
-    pop       esi
-    pop       edi
-    ret
-  }
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                uint8* dst_a, int dst_stride_a,
-                                uint8* dst_b, int dst_stride_b,
-                                int w) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
-    mov       edx, [esp + 16 + 12]  // dst_a
-    mov       esi, [esp + 16 + 16]  // dst_stride_a
-    mov       ebx, [esp + 16 + 20]  // dst_b
-    mov       ebp, [esp + 16 + 24]  // dst_stride_b
-    mov       ecx, esp
-    sub       esp, 4 + 16
-    and       esp, ~15
-    mov       [esp + 16], ecx
-    mov       ecx, [ecx + 16 + 28]  // w
-
-    align      16
- convertloop:
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    movdqa    xmm0, [eax]
-    movdqa    xmm1, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm0  // use xmm7 as temp register.
-    punpcklbw xmm0, xmm1
-    punpckhbw xmm7, xmm1
-    movdqa    xmm1, xmm7
-    movdqa    xmm2, [eax]
-    movdqa    xmm3, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm2
-    punpcklbw xmm2, xmm3
-    punpckhbw xmm7, xmm3
-    movdqa    xmm3, xmm7
-    movdqa    xmm4, [eax]
-    movdqa    xmm5, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm4
-    punpcklbw xmm4, xmm5
-    punpckhbw xmm7, xmm5
-    movdqa    xmm5, xmm7
-    movdqa    xmm6, [eax]
-    movdqa    xmm7, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    [esp], xmm5  // backup xmm5
-    neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
-    punpcklbw xmm6, xmm7
-    punpckhbw xmm5, xmm7
-    movdqa    xmm7, xmm5
-    lea       eax, [eax + 8 * edi + 16]
-    neg       edi
-    // Second round of bit swap.
-    movdqa    xmm5, xmm0
-    punpcklwd xmm0, xmm2
-    punpckhwd xmm5, xmm2
-    movdqa    xmm2, xmm5
-    movdqa    xmm5, xmm1
-    punpcklwd xmm1, xmm3
-    punpckhwd xmm5, xmm3
-    movdqa    xmm3, xmm5
-    movdqa    xmm5, xmm4
-    punpcklwd xmm4, xmm6
-    punpckhwd xmm5, xmm6
-    movdqa    xmm6, xmm5
-    movdqa    xmm5, [esp]  // restore xmm5
-    movdqa    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
-    punpcklwd xmm5, xmm7
-    punpckhwd xmm6, xmm7
-    movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    movdqa    xmm6, xmm0
-    punpckldq xmm0, xmm4
-    punpckhdq xmm6, xmm4
-    movdqa    xmm4, xmm6
-    movdqa    xmm6, [esp]  // restore xmm6
-    movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [ebx], xmm0
-    movlpd    qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm4
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
-    punpckldq xmm2, xmm6
-    movlpd    qword ptr [edx], xmm2
-    movhpd    qword ptr [ebx], xmm2
-    punpckhdq xmm0, xmm6
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
-    punpckldq xmm1, xmm5
-    movlpd    qword ptr [edx], xmm1
-    movhpd    qword ptr [ebx], xmm1
-    punpckhdq xmm0, xmm5
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
-    punpckldq xmm3, xmm7
-    movlpd    qword ptr [edx], xmm3
-    movhpd    qword ptr [ebx], xmm3
-    punpckhdq xmm0, xmm7
-    sub       ecx, 8
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    jg        convertloop
-
-    mov       esp, [esp + 16]
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
-#define HAS_TRANSPOSE_WX8_SSSE3
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                               uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    ".p2align  4                                 \n"
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"(static_cast<intptr_t>(src_stride)),  // %3
-      "r"(static_cast<intptr_t>(dst_stride))   // %4
-    : "memory", "cc"
-  #if defined(__SSE2__)
-      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  #endif
-  );
-}
-
-#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
-#define HAS_TRANSPOSE_UVWX8_SSE2
-extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                    uint8* dst_a, int dst_stride_a,
-                                    uint8* dst_b, int dst_stride_b,
-                                    int w);
-  asm (
-    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
-    "push   %ebx                               \n"
-    "push   %esi                               \n"
-    "push   %edi                               \n"
-    "push   %ebp                               \n"
-    "mov    0x14(%esp),%eax                    \n"
-    "mov    0x18(%esp),%edi                    \n"
-    "mov    0x1c(%esp),%edx                    \n"
-    "mov    0x20(%esp),%esi                    \n"
-    "mov    0x24(%esp),%ebx                    \n"
-    "mov    0x28(%esp),%ebp                    \n"
-    "mov    %esp,%ecx                          \n"
-    "sub    $0x14,%esp                         \n"
-    "and    $0xfffffff0,%esp                   \n"
-    "mov    %ecx,0x10(%esp)                    \n"
-    "mov    0x2c(%ecx),%ecx                    \n"
-
-"1:                                            \n"
-    "movdqa (%eax),%xmm0                       \n"
-    "movdqa (%eax,%edi,1),%xmm1                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm0,%xmm7                        \n"
-    "punpcklbw %xmm1,%xmm0                     \n"
-    "punpckhbw %xmm1,%xmm7                     \n"
-    "movdqa %xmm7,%xmm1                        \n"
-    "movdqa (%eax),%xmm2                       \n"
-    "movdqa (%eax,%edi,1),%xmm3                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm2,%xmm7                        \n"
-    "punpcklbw %xmm3,%xmm2                     \n"
-    "punpckhbw %xmm3,%xmm7                     \n"
-    "movdqa %xmm7,%xmm3                        \n"
-    "movdqa (%eax),%xmm4                       \n"
-    "movdqa (%eax,%edi,1),%xmm5                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm4,%xmm7                        \n"
-    "punpcklbw %xmm5,%xmm4                     \n"
-    "punpckhbw %xmm5,%xmm7                     \n"
-    "movdqa %xmm7,%xmm5                        \n"
-    "movdqa (%eax),%xmm6                       \n"
-    "movdqa (%eax,%edi,1),%xmm7                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm5,(%esp)                       \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm6,%xmm5                        \n"
-    "punpcklbw %xmm7,%xmm6                     \n"
-    "punpckhbw %xmm7,%xmm5                     \n"
-    "movdqa %xmm5,%xmm7                        \n"
-    "lea    0x10(%eax,%edi,8),%eax             \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm0,%xmm5                        \n"
-    "punpcklwd %xmm2,%xmm0                     \n"
-    "punpckhwd %xmm2,%xmm5                     \n"
-    "movdqa %xmm5,%xmm2                        \n"
-    "movdqa %xmm1,%xmm5                        \n"
-    "punpcklwd %xmm3,%xmm1                     \n"
-    "punpckhwd %xmm3,%xmm5                     \n"
-    "movdqa %xmm5,%xmm3                        \n"
-    "movdqa %xmm4,%xmm5                        \n"
-    "punpcklwd %xmm6,%xmm4                     \n"
-    "punpckhwd %xmm6,%xmm5                     \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "movdqa (%esp),%xmm5                       \n"
-    "movdqa %xmm6,(%esp)                       \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "punpcklwd %xmm7,%xmm5                     \n"
-    "punpckhwd %xmm7,%xmm6                     \n"
-    "movdqa %xmm6,%xmm7                        \n"
-    "movdqa %xmm0,%xmm6                        \n"
-    "punpckldq %xmm4,%xmm0                     \n"
-    "punpckhdq %xmm4,%xmm6                     \n"
-    "movdqa %xmm6,%xmm4                        \n"
-    "movdqa (%esp),%xmm6                       \n"
-    "movlpd %xmm0,(%edx)                       \n"
-    "movhpd %xmm0,(%ebx)                       \n"
-    "movlpd %xmm4,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm2,%xmm0                        \n"
-    "punpckldq %xmm6,%xmm2                     \n"
-    "movlpd %xmm2,(%edx)                       \n"
-    "movhpd %xmm2,(%ebx)                       \n"
-    "punpckhdq %xmm6,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm1,%xmm0                        \n"
-    "punpckldq %xmm5,%xmm1                     \n"
-    "movlpd %xmm1,(%edx)                       \n"
-    "movhpd %xmm1,(%ebx)                       \n"
-    "punpckhdq %xmm5,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm3,%xmm0                        \n"
-    "punpckldq %xmm7,%xmm3                     \n"
-    "movlpd %xmm3,(%edx)                       \n"
-    "movhpd %xmm3,(%ebx)                       \n"
-    "punpckhdq %xmm7,%xmm0                     \n"
-    "sub    $0x8,%ecx                          \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "jg     1b                                 \n"
-    "mov    0x10(%esp),%esp                    \n"
-    "pop    %ebp                               \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "pop    %ebx                               \n"
-    "ret                                       \n"
-);
-#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
-#define HAS_TRANSPOSE_WX8_FAST_SSSE3
-static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
-                                    uint8* dst, int dst_stride, int width) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  4                                 \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%3),%%xmm1                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqa     (%0),%%xmm2                      \n"
-  "movdqa     %%xmm0,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm9                    \n"
-  "palignr    $0x8,%%xmm1,%%xmm1               \n"
-  "palignr    $0x8,%%xmm9,%%xmm9               \n"
-  "movdqa     (%0,%3),%%xmm3                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm10                   \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm10                   \n"
-  "movdqa     %%xmm2,%%xmm3                    \n"
-  "movdqa     %%xmm10,%%xmm11                  \n"
-  "movdqa     (%0),%%xmm4                      \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "movdqa     (%0,%3),%%xmm5                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm12                   \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm12                   \n"
-  "movdqa     %%xmm4,%%xmm5                    \n"
-  "movdqa     %%xmm12,%%xmm13                  \n"
-  "movdqa     (%0),%%xmm6                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movdqa     (%0,%3),%%xmm7                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm14                   \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "punpckhbw  %%xmm7,%%xmm14                   \n"
-  "neg        %3                               \n"
-  "movdqa     %%xmm6,%%xmm7                    \n"
-  "movdqa     %%xmm14,%%xmm15                  \n"
-  "lea        0x10(%0,%3,8),%0                 \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "neg        %3                               \n"
-   // Second round of bit swap.
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "palignr    $0x8,%%xmm2,%%xmm2               \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm4,%%xmm6                    \n"
-  "movdqa     %%xmm5,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "punpcklwd  %%xmm10,%%xmm8                   \n"
-  "punpcklwd  %%xmm11,%%xmm9                   \n"
-  "movdqa     %%xmm8,%%xmm10                   \n"
-  "movdqa     %%xmm9,%%xmm11                   \n"
-  "palignr    $0x8,%%xmm10,%%xmm10             \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "punpcklwd  %%xmm14,%%xmm12                  \n"
-  "punpcklwd  %%xmm15,%%xmm13                  \n"
-  "movdqa     %%xmm12,%%xmm14                  \n"
-  "movdqa     %%xmm13,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movdqa     %%xmm0,%%xmm4                    \n"
-  "palignr    $0x8,%%xmm4,%%xmm4               \n"
-  "movq       %%xmm4,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movdqa     %%xmm2,%%xmm6                    \n"
-  "movq       %%xmm2,(%1)                      \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movq       %%xmm6,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm1,%%xmm5                    \n"
-  "movq       %%xmm1,(%1)                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "movq       %%xmm5,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movq       %%xmm3,(%1)                      \n"
-  "movdqa     %%xmm3,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "movq       %%xmm7,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm12,%%xmm8                   \n"
-  "movq       %%xmm8,(%1)                      \n"
-  "movdqa     %%xmm8,%%xmm12                   \n"
-  "palignr    $0x8,%%xmm12,%%xmm12             \n"
-  "movq       %%xmm12,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm14,%%xmm10                  \n"
-  "movdqa     %%xmm10,%%xmm14                  \n"
-  "movq       %%xmm10,(%1)                     \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "punpckldq  %%xmm13,%%xmm9                   \n"
-  "movq       %%xmm14,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm9,%%xmm13                   \n"
-  "movq       %%xmm9,(%1)                      \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movq       %%xmm13,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm15,%%xmm11                  \n"
-  "movq       %%xmm11,(%1)                     \n"
-  "movdqa     %%xmm11,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "sub        $0x10,%2                         \n"
-  "movq       %%xmm15,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride)),  // %3
-    "r"(static_cast<intptr_t>(dst_stride))   // %4
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-);
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                uint8* dst_a, int dst_stride_a,
-                                uint8* dst_b, int dst_stride_b,
-                                int w) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  4                                 \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%4),%%xmm1                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm1                    \n"
-  "movdqa     (%0),%%xmm2                      \n"
-  "movdqa     (%0,%4),%%xmm3                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm3                    \n"
-  "movdqa     (%0),%%xmm4                      \n"
-  "movdqa     (%0,%4),%%xmm5                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm5                    \n"
-  "movdqa     (%0),%%xmm6                      \n"
-  "movdqa     (%0,%4),%%xmm7                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm8                    \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "neg        %4                               \n"
-  "lea        0x10(%0,%4,8),%0                 \n"
-  "punpckhbw  %%xmm7,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm7                    \n"
-  "neg        %4                               \n"
-   // Second round of bit swap.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "movdqa     %%xmm1,%%xmm9                    \n"
-  "punpckhwd  %%xmm2,%%xmm8                    \n"
-  "punpckhwd  %%xmm3,%%xmm9                    \n"
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm2                    \n"
-  "movdqa     %%xmm9,%%xmm3                    \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "movdqa     %%xmm5,%%xmm9                    \n"
-  "punpckhwd  %%xmm6,%%xmm8                    \n"
-  "punpckhwd  %%xmm7,%%xmm9                    \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm8,%%xmm6                    \n"
-  "movdqa     %%xmm9,%%xmm7                    \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-  "punpckhdq  %%xmm4,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movlpd     %%xmm2,(%1)                      \n"
-  "movhpd     %%xmm2,(%2)                      \n"
-  "punpckhdq  %%xmm6,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm1,%%xmm8                    \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movlpd     %%xmm1,(%1)                      \n"
-  "movhpd     %%xmm1,(%2)                      \n"
-  "punpckhdq  %%xmm5,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm3,%%xmm8                    \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movlpd     %%xmm3,(%1)                      \n"
-  "movhpd     %%xmm3,(%2)                      \n"
-  "punpckhdq  %%xmm7,%%xmm8                    \n"
-  "sub        $0x8,%3                          \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst_a),  // %1
-    "+r"(dst_b),  // %2
-    "+r"(w)   // %3
-  : "r"(static_cast<intptr_t>(src_stride)),    // %4
-    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
-    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9"
-);
-}
-#endif
-#endif
-
-static void TransposeWx8_C(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride,
-                           int width) {
-  for (int i = 0; i < width; ++i) {
-    dst[0] = src[0 * src_stride];
-    dst[1] = src[1 * src_stride];
-    dst[2] = src[2 * src_stride];
-    dst[3] = src[3 * src_stride];
-    dst[4] = src[4 * src_stride];
-    dst[5] = src[5 * src_stride];
-    dst[6] = src[6 * src_stride];
-    dst[7] = src[7 * src_stride];
-    ++src;
-    dst += dst_stride;
-  }
-}
-
-static void TransposeWxH_C(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride,
-                           int width, int height) {
-  for (int i = 0; i < width; ++i) {
-    for (int j = 0; j < height; ++j) {
-      dst[i * dst_stride + j] = src[j * src_stride + i];
-    }
-  }
-}
-
 LIBYUV_API
 void TransposePlane(const uint8* src, int src_stride,
                     uint8* dst, int dst_stride,
                     int width, int height) {
+  int i = height;
   void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
-                       int width) = TransposeWx8_C;
-#if defined(HAS_TRANSPOSE_WX8_NEON)
+                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
   }
 #endif
-#if defined(HAS_TRANSPOSE_WX8_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    TransposeWx8 = TransposeWx8_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeWx8 = TransposeWx8_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx8 = TransposeWx8_Fast_SSSE3;
+    }
   }
 #endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
-    TransposeWx8 = TransposeWx8_FAST_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+      TransposeWx8 = TransposeWx8_Fast_DSPR2;
+    } else {
+      TransposeWx8 = TransposeWx8_DSPR2;
+    }
   }
 #endif
 
   // Work across the source in 8x8 tiles
-  int i = height;
   while (i >= 8) {
     TransposeWx8(src, src_stride, dst, dst_stride, width);
     src += 8 * src_stride;    // Go down 8 rows.
@@ -806,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride,
     i -= 8;
   }
 
-  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  if (i > 0) {
+    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  }
 }
 
 LIBYUV_API
@@ -837,56 +101,74 @@ LIBYUV_API
 void RotatePlane180(const uint8* src, int src_stride,
                     uint8* dst, int dst_stride,
                     int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRow = MirrorRow_NEON;
-  }
-#endif
-#if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    MirrorRow = MirrorRow_SSE2;
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    MirrorRow = MirrorRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
   }
 #endif
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_NEON;
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
   }
 #endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+    MirrorRow = MirrorRow_DSPR2;
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
-  if (width > kMaxStride) {
-    return;
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
-  // Swap first and last row and mirror the content. Uses a temporary row.
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
-  int half_height = (height + 1) >> 1;
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
   // Odd height will harmlessly mirror the middle row twice.
-  for (int y = 0; y < half_height; ++y) {
+  for (y = 0; y < half_height; ++y) {
     MirrorRow(src, row, width);  // Mirror first row into a buffer
     src += src_stride;
     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
@@ -895,44 +177,7 @@ void RotatePlane180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
-}
-
-static void TransposeUVWx8_C(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b,
-                             int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_a[0] = src[0 * src_stride + 0];
-    dst_b[0] = src[0 * src_stride + 1];
-    dst_a[1] = src[1 * src_stride + 0];
-    dst_b[1] = src[1 * src_stride + 1];
-    dst_a[2] = src[2 * src_stride + 0];
-    dst_b[2] = src[2 * src_stride + 1];
-    dst_a[3] = src[3 * src_stride + 0];
-    dst_b[3] = src[3 * src_stride + 1];
-    dst_a[4] = src[4 * src_stride + 0];
-    dst_b[4] = src[4 * src_stride + 1];
-    dst_a[5] = src[5 * src_stride + 0];
-    dst_b[5] = src[5 * src_stride + 1];
-    dst_a[6] = src[6 * src_stride + 0];
-    dst_b[6] = src[6 * src_stride + 1];
-    dst_a[7] = src[7 * src_stride + 0];
-    dst_b[7] = src[7 * src_stride + 1];
-    src += 2;
-    dst_a += dst_stride_a;
-    dst_b += dst_stride_b;
-  }
-}
-
-static void TransposeUVWxH_C(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b,
-                             int width, int height) {
-  for (int i = 0; i < width * 2; i += 2)
-    for (int j = 0; j < height; ++j) {
-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
-    }
+  free_aligned_buffer_64(row);
 }
 
 LIBYUV_API
@@ -940,24 +185,32 @@ void TransposeUV(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
                  int width, int height) {
+  int i = height;
   void (*TransposeUVWx8)(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
   }
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
-    TransposeUVWx8 = TransposeUVWx8_SSE2;
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx8 = TransposeUVWx8_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    TransposeUVWx8 = TransposeUVWx8_DSPR2;
   }
 #endif
 
   // Work through the source in 8x8 tiles.
-  int i = height;
   while (i >= 8) {
     TransposeUVWx8(src, src_stride,
                    dst_a, dst_stride_a,
@@ -969,10 +222,12 @@ void TransposeUV(const uint8* src, int src_stride,
     i -= 8;
   }
 
-  TransposeUVWxH_C(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
-                   width, i);
+  if (i > 0) {
+    TransposeUVWxH_C(src, src_stride,
+                     dst_a, dst_stride_a,
+                     dst_b, dst_stride_b,
+                     width, i);
+  }
 }
 
 LIBYUV_API
@@ -1011,25 +266,31 @@ void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
                  int width, int height) {
-  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorRowUV_C;
-#if defined(HAS_MIRRORROW_UV_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRowUV = MirrorRowUV_NEON;
+  int i;
+  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+      MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    MirrorUVRow = MirrorUVRow_NEON;
   }
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
-    MirrorRowUV = MirrorRowUV_SSSE3;
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    MirrorUVRow = MirrorUVRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    MirrorUVRow = MirrorUVRow_DSPR2;
   }
 #endif
 
   dst_a += dst_stride_a * (height - 1);
   dst_b += dst_stride_b * (height - 1);
 
-  for (int i = 0; i < height; ++i) {
-    MirrorRowUV(src, dst_a, dst_b, width);
+  for (i = 0; i < height; ++i) {
+    MirrorUVRow(src, dst_a, dst_b, width);
     src += src_stride;
     dst_a -= dst_stride_a;
     dst_b -= dst_stride_b;
@@ -1037,6 +298,50 @@ void RotateUV180(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int width, int height,
+                enum RotationMode mode) {
+  if (!src || width <= 0 || height == 0 || !dst) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      CopyPlane(src, src_stride,
+                dst, dst_stride,
+                width, height);
+      return 0;
+    case kRotate90:
+      RotatePlane90(src, src_stride,
+                    dst, dst_stride,
+                    width, height);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src, src_stride,
+                     dst, dst_stride,
+                     width, height);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src, src_stride,
+                     dst, dst_stride,
+                     width, height);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+LIBYUV_API
 int I420Rotate(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
@@ -1044,13 +349,13 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height,
-               RotationMode mode) {
+               enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
       !dst_y || !dst_u || !dst_v) {
     return -1;
   }
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
 
   // Negative height means invert the image.
   if (height < 0) {
@@ -1120,13 +425,13 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height,
-                     RotationMode mode) {
+                     enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_uv || width <= 0 || height == 0 ||
       !dst_y || !dst_u || !dst_v) {
     return -1;
   }
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
 
   // Negative height means invert the image.
   if (height < 0) {
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
new file mode 100644
index 00000000..31a74c31
--- /dev/null
+++ b/files/source/rotate_any.cc
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                 uint8* dst, int dst_stride, int width) {                      \
+      int r = width & MASK;                                                    \
+      int n = width - r;                                                       \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
+      }                                                                        \
+      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
+    }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_DSPR2
+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#endif
+#undef TANY
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                uint8* dst_a, int dst_stride_a,                                \
+                uint8* dst_b, int dst_stride_b, int width) {                   \
+      int r = width & MASK;                                                    \
+      int n = width - r;                                                       \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
+                  n);                                                          \
+      }                                                                        \
+      TransposeUVWx8_C(src + n * 2, src_stride,                                \
+                       dst_a + n * dst_stride_a, dst_stride_a,                 \
+                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
+    }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_DSPR2
+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#endif
+#undef TUVANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index 9c994467..787c0ad1 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -22,32 +22,41 @@ extern "C" {
 
 // ARGBScale has a function to copy pixels to a row, striding each source
 // pixel by a constant.
-#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
-  defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width);
+                               int src_stepx, uint8* dst_ptr, int dst_width);
 #endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
 void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx,
-                            uint8* dst_ptr, int dst_width);
+                            int src_stepx, uint8* dst_ptr, int dst_width);
 
 static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride,
-                          int width, int height) {
+                          uint8* dst, int dst_stride, int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;
   void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
       int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(height, 4) &&  // width of dest.
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
     ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
 
-  int src_pixel_step = src_stride / 4;
-  for (int i = 0; i < width; ++i) {  // column of source to row of dest.
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
     ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
     dst += dst_stride;
     src += 4;
@@ -55,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
 }
 
 void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride,
-                  int width, int height) {
+                  uint8* dst, int dst_stride, int width, int height) {
   // Rotate by 90 is a ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -66,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
 }
 
 void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+                    uint8* dst, int dst_stride, int width, int height) {
   // Rotate by 270 is a ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -77,60 +84,83 @@ void ARGBRotate270(const uint8* src, int src_stride,
 }
 
 void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width * 4);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
   void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
       ARGBMirrorRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
   }
 #endif
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
-    CopyRow = CopyRow_NEON;
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
   }
 #endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    CopyRow = CopyRow_X86;
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
-  if (width * 4 > kMaxStride) {
-    return;
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
-  // Swap first and last row and mirror the content. Uses a temporary row.
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
-  int half_height = (height + 1) >> 1;
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
   // Odd height will harmlessly mirror the middle row twice.
-  for (int y = 0; y < half_height; ++y) {
+  for (y = 0; y < half_height; ++y) {
     ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
-    src += src_stride;
     ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
-    dst += dst_stride;
     CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
+    src += src_stride;
+    dst += dst_stride;
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
+  free_aligned_buffer_64(row);
 }
 
 LIBYUV_API
 int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height,
-               RotationMode mode) {
+               uint8* dst_argb, int dst_stride_argb, int width, int height,
+               enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
   }
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
new file mode 100644
index 00000000..b33a9a0c
--- /dev/null
+++ b/files/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[0] = src[0 * src_stride];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_a[0] = src[0 * src_stride + 0];
+    dst_b[0] = src[0 * src_stride + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst[i * dst_stride + j] = src[j * src_stride + i];
+    }
+  }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height) {
+  int i;
+  for (i = 0; i < width * 2; i += 2) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
new file mode 100644
index 00000000..cbe870ca
--- /dev/null
+++ b/files/source/rotate_gcc.cc
@@ -0,0 +1,368 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm9                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "palignr    $0x8,%%xmm9,%%xmm9               \n"
+    "movdqu     (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm10                   \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm10                   \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movdqa     %%xmm10,%%xmm11                  \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "movdqu     (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm12                   \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm12                   \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movdqa     %%xmm12,%%xmm13                  \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movdqu     (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm14                   \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "punpckhbw  %%xmm7,%%xmm14                   \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "movdqa     %%xmm14,%%xmm15                  \n"
+    "lea        0x10(%0,%3,8),%0                 \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "punpcklwd  %%xmm10,%%xmm8                   \n"
+    "punpcklwd  %%xmm11,%%xmm9                   \n"
+    "movdqa     %%xmm8,%%xmm10                   \n"
+    "movdqa     %%xmm9,%%xmm11                   \n"
+    "palignr    $0x8,%%xmm10,%%xmm10             \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "punpcklwd  %%xmm14,%%xmm12                  \n"
+    "punpcklwd  %%xmm15,%%xmm13                  \n"
+    "movdqa     %%xmm12,%%xmm14                  \n"
+    "movdqa     %%xmm13,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm12,%%xmm8                   \n"
+    "movq       %%xmm8,(%1)                      \n"
+    "movdqa     %%xmm8,%%xmm12                   \n"
+    "palignr    $0x8,%%xmm12,%%xmm12             \n"
+    "movq       %%xmm12,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm14,%%xmm10                  \n"
+    "movdqa     %%xmm10,%%xmm14                  \n"
+    "movq       %%xmm10,(%1)                     \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "punpckldq  %%xmm13,%%xmm9                   \n"
+    "movq       %%xmm14,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm9,%%xmm13                   \n"
+    "movq       %%xmm9,(%1)                      \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movq       %%xmm13,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm15,%%xmm11                  \n"
+    "movq       %%xmm11,(%1)                     \n"
+    "movdqa     %%xmm11,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "sub        $0x10,%2                         \n"
+    "movq       %%xmm15,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8.  64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%4),%%xmm1                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm1                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqu     (%0,%4),%%xmm3                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm3                    \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "movdqu     (%0,%4),%%xmm5                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm5                    \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "movdqu     (%0,%4),%%xmm7                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm8                    \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %4                               \n"
+    "lea        0x10(%0,%4,8),%0                 \n"
+    "punpckhbw  %%xmm7,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm7                    \n"
+    "neg        %4                               \n"
+     // Second round of bit swap.
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "movdqa     %%xmm1,%%xmm9                    \n"
+    "punpckhwd  %%xmm2,%%xmm8                    \n"
+    "punpckhwd  %%xmm3,%%xmm9                    \n"
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm2                    \n"
+    "movdqa     %%xmm9,%%xmm3                    \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "movdqa     %%xmm5,%%xmm9                    \n"
+    "punpckhwd  %%xmm6,%%xmm8                    \n"
+    "punpckhwd  %%xmm7,%%xmm9                    \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm8,%%xmm6                    \n"
+    "movdqa     %%xmm9,%%xmm7                    \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+    "punpckhdq  %%xmm4,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movlpd     %%xmm2,(%1)                      \n"
+    "movhpd     %%xmm2,(%2)                      \n"
+    "punpckhdq  %%xmm6,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm1,%%xmm8                    \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movlpd     %%xmm1,(%1)                      \n"
+    "movhpd     %%xmm1,(%2)                      \n"
+    "punpckhdq  %%xmm5,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm3,%%xmm8                    \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movlpd     %%xmm3,(%1)                      \n"
+    "movhpd     %%xmm3,(%2)                      \n"
+    "punpckhdq  %%xmm7,%%xmm8                    \n"
+    "sub        $0x8,%3                          \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst_a),  // %1
+      "+r"(dst_b),  // %2
+      "+r"(width)   // %3
+    : "r"((intptr_t)(src_stride)),    // %4
+      "r"((intptr_t)(dst_stride_a)),  // %5
+      "r"((intptr_t)(dst_stride_b))   // %6
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/rotate_mips.cc b/files/source/rotate_mips.cc
new file mode 100644
index 00000000..1e8ce251
--- /dev/null
+++ b/files/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+   __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"
+//dst + dst_stride word aligned
+    "1:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "sw               $s0, 0(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "sw               $s1, 4(%[dst])                   \n"
+      "bnez             %[width], 1b                     \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n"
+      "b                2f                               \n"
+//dst + dst_stride unaligned
+   "11:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "swr              $s0, 0(%[dst])                   \n"
+      "swl              $s0, 3(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "swr              $s1, 4(%[dst])                   \n"
+      "swl              $s1, 7(%[dst])                   \n"
+      "bnez             %[width], 11b                    \n"
+       "addu             %[dst], %[dst], %[dst_stride]   \n"
+    "2:                                                  \n"
+      ".set pop                                          \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1",  "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1"
+  );
+}
+
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  __asm__ __volatile__ (
+      ".set noat                                         \n"
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz             %[width], 2f                     \n"
+      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+
+      "srl              $AT, %[width], 0x2               \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"
+//dst + dst_stride word aligned
+      "1:                                                \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "sw              $s4, 0(%[dst])                    \n"
+      "sw              $t0, 4(%[dst])                    \n"
+      "sw              $s6, 0($s0)                       \n"
+      "sw              $t8, 4($s0)                       \n"
+      "sw              $s5, 0($s1)                       \n"
+      "sw              $t1, 4($s1)                       \n"
+      "sw              $s7, 0($s2)                       \n"
+      "sw              $t9, 4($s2)                       \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 1b                          \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "b                2f                               \n"
+//dst + dst_stride unaligned
+      "11:                                               \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "swr              $s4, 0(%[dst])                   \n"
+      "swl              $s4, 3(%[dst])                   \n"
+      "swr              $t0, 4(%[dst])                   \n"
+      "swl              $t0, 7(%[dst])                   \n"
+      "swr              $s6, 0($s0)                      \n"
+      "swl              $s6, 3($s0)                      \n"
+      "swr              $t8, 4($s0)                      \n"
+      "swl              $t8, 7($s0)                      \n"
+      "swr              $s5, 0($s1)                      \n"
+      "swl              $s5, 3($s1)                      \n"
+      "swr              $t1, 4($s1)                      \n"
+      "swl              $t1, 7($s1)                      \n"
+      "swr              $s7, 0($s2)                      \n"
+      "swl              $s7, 3($s2)                      \n"
+      "swr              $t9, 4($s2)                      \n"
+      "swl              $t9, 7($s2)                      \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 11b                         \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "2:                                                \n"
+      ".set pop                                          \n"
+      ".set at                                           \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+  );
+}
+
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+                          uint8* dst_a, int dst_stride_a,
+                          uint8* dst_b, int dst_stride_b,
+                          int width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz            %[width], 2f                      \n"
+      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      "addu            $t3, $t2, %[src_stride]           \n"
+      "addu            $t5, $t4, %[src_stride]           \n"
+      "addu            $t6, $t2, $t4                     \n"
+      "subu            $t7, $t9, %[src_stride]           \n"
+      "srl             $t1, %[width], 1                  \n"
+
+// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      "andi            $t0, %[dst_a], 0x3                \n"
+      "andi            $t8, %[dst_b], 0x3                \n"
+      "or              $t0, $t0, $t8                     \n"
+      "andi            $t8, %[dst_stride_a], 0x3         \n"
+      "andi            $s5, %[dst_stride_b], 0x3         \n"
+      "or              $t8, $t8, $s5                     \n"
+      "or              $t0, $t0, $t8                     \n"
+      "bnez            $t0, 11f                          \n"
+      " nop                                              \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "sw              $s3, 0($s5)                       \n"
+      "sw              $s4, 0($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "sw              $s3, 0(%[dst_a])                  \n"
+      "sw              $s4, 0(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "sw              $s3, 4($s5)                       \n"
+      "sw              $s4, 4($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "sw              $s3, 4(%[dst_a])                  \n"
+      "sw              $s4, 4(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 1b                           \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+      "b               2f                                \n"
+      " nop                                              \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+   "11:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "swr             $s3, 0($s5)                       \n"
+      "swl             $s3, 3($s5)                       \n"
+      "swr             $s4, 0($s6)                       \n"
+      "swl             $s4, 3($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "swr             $s3, 0(%[dst_a])                  \n"
+      "swl             $s3, 3(%[dst_a])                  \n"
+      "swr             $s4, 0(%[dst_b])                  \n"
+      "swl             $s4, 3(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+
+      "swr             $s3, 4($s5)                       \n"
+      "swl             $s3, 7($s5)                       \n"
+      "swr             $s4, 4($s6)                       \n"
+      "swl             $s4, 7($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "swr             $s3, 4(%[dst_a])                  \n"
+      "swl             $s3, 7(%[dst_a])                  \n"
+      "swr             $s4, 4(%[dst_b])                  \n"
+      "swl             $s4, 7(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 11b                          \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+
+      "2:                                                \n"
+      ".set pop                                          \n"
+      : [src] "+r" (src),
+        [dst_a] "+r" (dst_a),
+        [dst_b] "+r" (dst_b),
+        [width] "+r" (width),
+        [src_stride] "+r" (src_stride)
+      : [dst_stride_a] "r" (dst_stride_a),
+        [dst_stride_b] "r" (dst_stride_b)
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3",
+        "s4", "s5", "s6"
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index 49b30032..1c22b472 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -4,11 +4,12 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -17,33 +18,42 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
-static const uvec8 kVTbl4x4Transpose =
+static uvec8 kVTbl4x4Transpose =
   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
 
 void TransposeWx8_NEON(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride,
                        int width) {
+  const uint8* src_temp;
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
     // at w-8 allow for this
-    "sub         %4, #8                        \n"
+    "sub         %5, #8                        \n"
 
     // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  4                               \n"
     "1:                                        \n"
-      "mov         r9, %0                      \n"
-
-      "vld1.8      {d0}, [r9], %1              \n"
-      "vld1.8      {d1}, [r9], %1              \n"
-      "vld1.8      {d2}, [r9], %1              \n"
-      "vld1.8      {d3}, [r9], %1              \n"
-      "vld1.8      {d4}, [r9], %1              \n"
-      "vld1.8      {d5}, [r9], %1              \n"
-      "vld1.8      {d6}, [r9], %1              \n"
-      "vld1.8      {d7}, [r9]                  \n"
+      "mov         %0, %1                      \n"
+
+      MEMACCESS(0)
+      "vld1.8      {d0}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d7}, [%0]                  \n"
 
       "vtrn.8      d1, d0                      \n"
       "vtrn.8      d3, d2                      \n"
@@ -65,152 +75,205 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
       "vrev16.8    q2, q2                      \n"
       "vrev16.8    q3, q3                      \n"
 
-      "mov         r9, %2                      \n"
-
-      "vst1.8      {d1}, [r9], %3              \n"
-      "vst1.8      {d0}, [r9], %3              \n"
-      "vst1.8      {d3}, [r9], %3              \n"
-      "vst1.8      {d2}, [r9], %3              \n"
-      "vst1.8      {d5}, [r9], %3              \n"
-      "vst1.8      {d4}, [r9], %3              \n"
-      "vst1.8      {d7}, [r9], %3              \n"
-      "vst1.8      {d6}, [r9]                  \n"
-
-      "add         %0, #8                      \n"  // src += 8
-      "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
-      "subs        %4,  #8                     \n"  // w   -= 8
+      "mov         %0, %3                      \n"
+
+    MEMACCESS(0)
+      "vst1.8      {d1}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d0}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d3}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d2}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d5}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d4}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d7}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d6}, [%0]                  \n"
+
+      "add         %1, #8                      \n"  // src += 8
+      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                     \n"  // w   -= 8
       "bge         1b                          \n"
 
     // add 8 back to counter. if the result is 0 there are
     // no residuals.
-    "adds        %4, #8                        \n"
+    "adds        %5, #8                        \n"
     "beq         4f                            \n"
 
     // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %4, #2                        \n"
+    "cmp         %5, #2                        \n"
     "blt         3f                            \n"
 
-    "cmp         %4, #4                        \n"
+    "cmp         %5, #4                        \n"
     "blt         2f                            \n"
 
     // 4x8 block
-    "mov         r9, %0                        \n"
-    "vld1.32     {d0[0]}, [r9], %1             \n"
-    "vld1.32     {d0[1]}, [r9], %1             \n"
-    "vld1.32     {d1[0]}, [r9], %1             \n"
-    "vld1.32     {d1[1]}, [r9], %1             \n"
-    "vld1.32     {d2[0]}, [r9], %1             \n"
-    "vld1.32     {d2[1]}, [r9], %1             \n"
-    "vld1.32     {d3[0]}, [r9], %1             \n"
-    "vld1.32     {d3[1]}, [r9]                 \n"
-
-    "mov         r9, %2                        \n"
-
-    "vld1.8      {q3}, [%5]                    \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[1]}, [%0]                 \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(6)
+    "vld1.8      {q3}, [%6]                    \n"
 
     "vtbl.8      d4, {d0, d1}, d6              \n"
     "vtbl.8      d5, {d0, d1}, d7              \n"
     "vtbl.8      d0, {d2, d3}, d6              \n"
     "vtbl.8      d1, {d2, d3}, d7              \n"
 
-    // TODO: rework shuffle above to write
-    //       out with 4 instead of 8 writes
-    "vst1.32     {d4[0]}, [r9], %3             \n"
-    "vst1.32     {d4[1]}, [r9], %3             \n"
-    "vst1.32     {d5[0]}, [r9], %3             \n"
-    "vst1.32     {d5[1]}, [r9]                 \n"
-
-    "add         r9, %2, #4                    \n"
-    "vst1.32     {d0[0]}, [r9], %3             \n"
-    "vst1.32     {d0[1]}, [r9], %3             \n"
-    "vst1.32     {d1[0]}, [r9], %3             \n"
-    "vst1.32     {d1[1]}, [r9]                 \n"
-
-    "add         %0, #4                        \n"  // src += 4
-    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %4,  #4                       \n"  // w   -= 4
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[1]}, [%0]                 \n"
+
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[1]}, [%0]                 \n"
+
+    "add         %1, #4                        \n"  // src += 4
+    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %5,  #4                       \n"  // w   -= 4
     "beq         4f                            \n"
 
     // some residual, check to see if it includes a 2x8 block,
     // or less
-    "cmp         %4, #2                        \n"
+    "cmp         %5, #2                        \n"
     "blt         3f                            \n"
 
     // 2x8 block
     "2:                                        \n"
-    "mov         r9, %0                        \n"
-    "vld1.16     {d0[0]}, [r9], %1             \n"
-    "vld1.16     {d1[0]}, [r9], %1             \n"
-    "vld1.16     {d0[1]}, [r9], %1             \n"
-    "vld1.16     {d1[1]}, [r9], %1             \n"
-    "vld1.16     {d0[2]}, [r9], %1             \n"
-    "vld1.16     {d1[2]}, [r9], %1             \n"
-    "vld1.16     {d0[3]}, [r9], %1             \n"
-    "vld1.16     {d1[3]}, [r9]                 \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[3]}, [%0]                 \n"
 
     "vtrn.8      d0, d1                        \n"
 
-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"
 
-    "vst1.64     {d0}, [r9], %3                \n"
-    "vst1.64     {d1}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0]                    \n"
 
-    "add         %0, #2                        \n"  // src += 2
-    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %4,  #2                       \n"  // w   -= 2
+    "add         %1, #2                        \n"  // src += 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %5,  #2                       \n"  // w   -= 2
     "beq         4f                            \n"
 
     // 1x8 block
     "3:                                        \n"
-    "vld1.8      {d0[0]}, [%0], %1             \n"
-    "vld1.8      {d0[1]}, [%0], %1             \n"
-    "vld1.8      {d0[2]}, [%0], %1             \n"
-    "vld1.8      {d0[3]}, [%0], %1             \n"
-    "vld1.8      {d0[4]}, [%0], %1             \n"
-    "vld1.8      {d0[5]}, [%0], %1             \n"
-    "vld1.8      {d0[6]}, [%0], %1             \n"
-    "vld1.8      {d0[7]}, [%0]                 \n"
-
-    "vst1.64     {d0}, [%2]                    \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[7]}, [%1]                 \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
 
     "4:                                        \n"
 
-    : "+r"(src),               // %0
-      "+r"(src_stride),        // %1
-      "+r"(dst),               // %2
-      "+r"(dst_stride),        // %3
-      "+r"(width)              // %4
-    : "r"(&kVTbl4x4Transpose)  // %5
-    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+    : "=&r"(src_temp),         // %0
+      "+r"(src),               // %1
+      "+r"(src_stride),        // %2
+      "+r"(dst),               // %3
+      "+r"(dst_stride),        // %4
+      "+r"(width)              // %5
+    : "r"(&kVTbl4x4Transpose)  // %6
+    : "memory", "cc", "q0", "q1", "q2", "q3"
   );
 }
 
-static const uvec8 kVTbl4x4TransposeDi =
+static uvec8 kVTbl4x4TransposeDi =
   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
 
 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) {
+  const uint8* src_temp;
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
     // at w-8 allow for this
-    "sub         %6, #8                        \n"
+    "sub         %7, #8                        \n"
 
     // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  4                               \n"
     "1:                                        \n"
-      "mov         r9, %0                      \n"
-
-      "vld2.8      {d0,  d1},  [r9], %1        \n"
-      "vld2.8      {d2,  d3},  [r9], %1        \n"
-      "vld2.8      {d4,  d5},  [r9], %1        \n"
-      "vld2.8      {d6,  d7},  [r9], %1        \n"
-      "vld2.8      {d16, d17}, [r9], %1        \n"
-      "vld2.8      {d18, d19}, [r9], %1        \n"
-      "vld2.8      {d20, d21}, [r9], %1        \n"
-      "vld2.8      {d22, d23}, [r9]            \n"
+      "mov         %0, %1                      \n"
+
+      MEMACCESS(0)
+      "vld2.8      {d0,  d1},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d22, d23}, [%0]            \n"
 
       "vtrn.8      q1, q0                      \n"
       "vtrn.8      q3, q2                      \n"
@@ -236,59 +299,84 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
       "vrev16.8    q10, q10                    \n"
       "vrev16.8    q11, q11                    \n"
 
-      "mov         r9, %2                      \n"
-
-      "vst1.8      {d2},  [r9], %3             \n"
-      "vst1.8      {d0},  [r9], %3             \n"
-      "vst1.8      {d6},  [r9], %3             \n"
-      "vst1.8      {d4},  [r9], %3             \n"
-      "vst1.8      {d18}, [r9], %3             \n"
-      "vst1.8      {d16}, [r9], %3             \n"
-      "vst1.8      {d22}, [r9], %3             \n"
-      "vst1.8      {d20}, [r9]                 \n"
-
-      "mov         r9, %4                      \n"
-
-      "vst1.8      {d3},  [r9], %5             \n"
-      "vst1.8      {d1},  [r9], %5             \n"
-      "vst1.8      {d7},  [r9], %5             \n"
-      "vst1.8      {d5},  [r9], %5             \n"
-      "vst1.8      {d19}, [r9], %5             \n"
-      "vst1.8      {d17}, [r9], %5             \n"
-      "vst1.8      {d23}, [r9], %5             \n"
-      "vst1.8      {d21}, [r9]                 \n"
-
-      "add         %0, #8*2                    \n"  // src   += 8*2
-      "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
-      "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
-      "subs        %6,  #8                     \n"  // w     -= 8
+      "mov         %0, %3                      \n"
+
+    MEMACCESS(0)
+      "vst1.8      {d2},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d0},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d6},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d4},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d18}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d16}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d22}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d20}, [%0]                 \n"
+
+      "mov         %0, %5                      \n"
+
+    MEMACCESS(0)
+      "vst1.8      {d3},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d1},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d7},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d5},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d19}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d17}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d23}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d21}, [%0]                 \n"
+
+      "add         %1, #8*2                    \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
+      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
+      "subs        %7,  #8                     \n"  // w     -= 8
       "bge         1b                          \n"
 
     // add 8 back to counter. if the result is 0 there are
     // no residuals.
-    "adds        %6, #8                        \n"
+    "adds        %7, #8                        \n"
     "beq         4f                            \n"
 
     // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %6, #2                        \n"
+    "cmp         %7, #2                        \n"
     "blt         3f                            \n"
 
-    "cmp         %6, #4                        \n"
+    "cmp         %7, #4                        \n"
     "blt         2f                            \n"
 
-    //TODO(frkoenig) : clean this up
+    // TODO(frkoenig): Clean this up
     // 4x8 block
-    "mov         r9, %0                        \n"
-    "vld1.64     {d0}, [r9], %1                \n"
-    "vld1.64     {d1}, [r9], %1                \n"
-    "vld1.64     {d2}, [r9], %1                \n"
-    "vld1.64     {d3}, [r9], %1                \n"
-    "vld1.64     {d4}, [r9], %1                \n"
-    "vld1.64     {d5}, [r9], %1                \n"
-    "vld1.64     {d6}, [r9], %1                \n"
-    "vld1.64     {d7}, [r9]                    \n"
-
-    "vld1.8      {q15}, [%7]                   \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d7}, [%0]                    \n"
+
+    MEMACCESS(8)
+    "vld1.8      {q15}, [%8]                   \n"
 
     "vtrn.8      q0, q1                        \n"
     "vtrn.8      q2, q3                        \n"
@@ -302,103 +390,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "vtbl.8      d22, {d6, d7}, d30            \n"
     "vtbl.8      d23, {d6, d7}, d31            \n"
 
-    "mov         r9, %2                        \n"
-
-    "vst1.32     {d16[0]},  [r9], %3           \n"
-    "vst1.32     {d16[1]},  [r9], %3           \n"
-    "vst1.32     {d17[0]},  [r9], %3           \n"
-    "vst1.32     {d17[1]},  [r9], %3           \n"
-
-    "add         r9, %2, #4                    \n"
-    "vst1.32     {d20[0]}, [r9], %3            \n"
-    "vst1.32     {d20[1]}, [r9], %3            \n"
-    "vst1.32     {d21[0]}, [r9], %3            \n"
-    "vst1.32     {d21[1]}, [r9]                \n"
-
-    "mov         r9, %4                        \n"
-
-    "vst1.32     {d18[0]}, [r9], %5            \n"
-    "vst1.32     {d18[1]}, [r9], %5            \n"
-    "vst1.32     {d19[0]}, [r9], %5            \n"
-    "vst1.32     {d19[1]}, [r9], %5            \n"
-
-    "add         r9, %4, #4                    \n"
-    "vst1.32     {d22[0]},  [r9], %5           \n"
-    "vst1.32     {d22[1]},  [r9], %5           \n"
-    "vst1.32     {d23[0]},  [r9], %5           \n"
-    "vst1.32     {d23[1]},  [r9]               \n"
-
-    "add         %0, #4*2                      \n"  // src   += 4 * 2
-    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %6,  #4                       \n"  // w     -= 4
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[1]},  [%0], %4           \n"
+
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[1]}, [%0]                \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[1]}, [%0], %6            \n"
+
+    "add         %0, %5, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[1]},  [%0]               \n"
+
+    "add         %1, #4*2                      \n"  // src   += 4 * 2
+    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
+    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
+    "subs        %7,  #4                       \n"  // w     -= 4
     "beq         4f                            \n"
 
     // some residual, check to see if it includes a 2x8 block,
     // or less
-    "cmp         %6, #2                        \n"
+    "cmp         %7, #2                        \n"
     "blt         3f                            \n"
 
     // 2x8 block
     "2:                                        \n"
-    "mov         r9, %0                        \n"
-    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
-    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
-    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
-    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
-    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
-    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
-    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
-    "vld2.16     {d1[3], d3[3]}, [r9]          \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
 
     "vtrn.8      d0, d1                        \n"
     "vtrn.8      d2, d3                        \n"
 
-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"
 
-    "vst1.64     {d0}, [r9], %3                \n"
-    "vst1.64     {d2}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d2}, [%0]                    \n"
 
-    "mov         r9, %4                        \n"
+    "mov         %0, %5                        \n"
 
-    "vst1.64     {d1}, [r9], %5                \n"
-    "vst1.64     {d3}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
+    "vst1.64     {d3}, [%0]                    \n"
 
-    "add         %0, #2*2                      \n"  // src   += 2 * 2
-    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %6,  #2                       \n"  // w     -= 2
+    "add         %1, #2*2                      \n"  // src   += 2 * 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
+    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
+    "subs        %7,  #2                       \n"  // w     -= 2
     "beq         4f                            \n"
 
     // 1x8 block
     "3:                                        \n"
-    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
-    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
-    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
-    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
-    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
-    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
-    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
-    "vld2.8      {d0[7], d1[7]}, [%0]          \n"
-
-    "vst1.64     {d0}, [%2]                    \n"
-    "vst1.64     {d1}, [%4]                    \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
+    "vst1.64     {d1}, [%5]                    \n"
 
     "4:                                        \n"
 
-    : "+r"(src),                 // %0
-      "+r"(src_stride),          // %1
-      "+r"(dst_a),               // %2
-      "+r"(dst_stride_a),        // %3
-      "+r"(dst_b),               // %4
-      "+r"(dst_stride_b),        // %5
-      "+r"(width)                // %6
-    : "r"(&kVTbl4x4TransposeDi)  // %7
-    : "memory", "cc", "r9",
+    : "=&r"(src_temp),           // %0
+      "+r"(src),                 // %1
+      "+r"(src_stride),          // %2
+      "+r"(dst_a),               // %3
+      "+r"(dst_stride_a),        // %4
+      "+r"(dst_b),               // %5
+      "+r"(dst_stride_b),        // %6
+      "+r"(width)                // %7
+    : "r"(&kVTbl4x4TransposeDi)  // %8
+    : "memory", "cc",
       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
   );
 }
-#endif
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
new file mode 100644
index 00000000..1ab448f3
--- /dev/null
+++ b/files/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) {
+  const uint8* src_temp;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allow for this
+    "sub         %3, %3, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                          \n"
+      "mov         %0, %1                        \n"
+
+      MEMACCESS(0)
+      "ld1        {v0.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v1.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v2.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v3.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v4.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v5.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v6.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v7.8b}, [%0]                  \n"
+
+      "trn2     v16.8b, v0.8b, v1.8b             \n"
+      "trn1     v17.8b, v0.8b, v1.8b             \n"
+      "trn2     v18.8b, v2.8b, v3.8b             \n"
+      "trn1     v19.8b, v2.8b, v3.8b             \n"
+      "trn2     v20.8b, v4.8b, v5.8b             \n"
+      "trn1     v21.8b, v4.8b, v5.8b             \n"
+      "trn2     v22.8b, v6.8b, v7.8b             \n"
+      "trn1     v23.8b, v6.8b, v7.8b             \n"
+
+      "trn2     v3.4h, v17.4h, v19.4h            \n"
+      "trn1     v1.4h, v17.4h, v19.4h            \n"
+      "trn2     v2.4h, v16.4h, v18.4h            \n"
+      "trn1     v0.4h, v16.4h, v18.4h            \n"
+      "trn2     v7.4h, v21.4h, v23.4h            \n"
+      "trn1     v5.4h, v21.4h, v23.4h            \n"
+      "trn2     v6.4h, v20.4h, v22.4h            \n"
+      "trn1     v4.4h, v20.4h, v22.4h            \n"
+
+      "trn2     v21.2s, v1.2s, v5.2s             \n"
+      "trn1     v17.2s, v1.2s, v5.2s             \n"
+      "trn2     v20.2s, v0.2s, v4.2s             \n"
+      "trn1     v16.2s, v0.2s, v4.2s             \n"
+      "trn2     v23.2s, v3.2s, v7.2s             \n"
+      "trn1     v19.2s, v3.2s, v7.2s             \n"
+      "trn2     v22.2s, v2.2s, v6.2s             \n"
+      "trn1     v18.2s, v2.2s, v6.2s             \n"
+
+      "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+      "st1      {v17.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v16.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v19.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v18.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v21.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v20.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v23.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v22.8b}, [%0]                   \n"
+
+      "add         %1, %1, #8                    \n"  // src += 8
+      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "b.ge        1b                            \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %3, %3, #8                      \n"
+    "b.eq        4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    "cmp         %3, #4                          \n"
+    "b.lt        2f                              \n"
+
+    // 4x8 block
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[3], [%0]                     \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(4)
+    "ld1      {v2.16b}, [%4]                     \n"
+
+    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                         \n"
+
+    "add         %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                         \n"
+
+    "add         %1, %1, #4                      \n"  // src += 4
+    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                      \n"  // w   -= 4
+    "b.eq        4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    // 2x8 block
+    "2:                                          \n"
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                     \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b                 \n"
+    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                       \n"
+
+    "add         %1, %1, #2                      \n"  // src += 2
+    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+    "subs        %3, %3,  #2                     \n"  // w   -= 2
+    "b.eq        4f                              \n"
+
+    // 1x8 block
+    "3:                                          \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]                 \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                   \n"
+
+    "4:                                          \n"
+
+    : "=&r"(src_temp),                            // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allow for this
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+    "mov       %0, %1                          \n"
+
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v1.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v2.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v3.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v4.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v5.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v6.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v7.16b}, [%0]                  \n"
+
+    "trn1      v16.16b, v0.16b, v1.16b         \n"
+    "trn2      v17.16b, v0.16b, v1.16b         \n"
+    "trn1      v18.16b, v2.16b, v3.16b         \n"
+    "trn2      v19.16b, v2.16b, v3.16b         \n"
+    "trn1      v20.16b, v4.16b, v5.16b         \n"
+    "trn2      v21.16b, v4.16b, v5.16b         \n"
+    "trn1      v22.16b, v6.16b, v7.16b         \n"
+    "trn2      v23.16b, v6.16b, v7.16b         \n"
+
+    "trn1      v0.8h, v16.8h, v18.8h           \n"
+    "trn2      v1.8h, v16.8h, v18.8h           \n"
+    "trn1      v2.8h, v20.8h, v22.8h           \n"
+    "trn2      v3.8h, v20.8h, v22.8h           \n"
+    "trn1      v4.8h, v17.8h, v19.8h           \n"
+    "trn2      v5.8h, v17.8h, v19.8h           \n"
+    "trn1      v6.8h, v21.8h, v23.8h           \n"
+    "trn2      v7.8h, v21.8h, v23.8h           \n"
+
+    "trn1      v16.4s, v0.4s, v2.4s            \n"
+    "trn2      v17.4s, v0.4s, v2.4s            \n"
+    "trn1      v18.4s, v1.4s, v3.4s            \n"
+    "trn2      v19.4s, v1.4s, v3.4s            \n"
+    "trn1      v20.4s, v4.4s, v6.4s            \n"
+    "trn2      v21.4s, v4.4s, v6.4s            \n"
+    "trn1      v22.4s, v5.4s, v7.4s            \n"
+    "trn2      v23.4s, v5.4s, v7.4s            \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v16.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[1], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v20.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v20.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[1], [%0]                \n"
+
+    "add       %1, %1, #16                     \n"  // src   += 8*2
+    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
+    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
+    "subs      %4, %4,  #8                     \n"  // w     -= 8
+    "b.ge      1b                              \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds      %4, %4, #8                      \n"
+    "b.eq      4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    "cmp       %4, #4                          \n"
+    "b.lt      2f                              \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1       {v0.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v1.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v2.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v3.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v4.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v5.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v6.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v7.8b}, [%0]                   \n"
+
+    MEMACCESS(8)
+    "ld1       {v30.16b}, [%8], #16            \n"
+    "ld1       {v31.16b}, [%8]                 \n"
+
+    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.s}[0],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[1],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[2],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[3],  [%0], %6           \n"
+
+    "add       %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[2], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[3], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v17.s}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[2], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[3], [%0], %7            \n"
+
+    "add       %0, %3, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[0],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[1],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[2],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[3],  [%0]               \n"
+
+    "add       %1, %1, #8                      \n"  // src   += 4 * 2
+    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
+    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
+    "subs      %4,  %4,  #4                    \n"  // w     -= 4
+    "b.eq      4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+    "trn1      v4.8b, v0.8b, v2.8b             \n"
+    "trn2      v5.8b, v0.8b, v2.8b             \n"
+    "trn1      v6.8b, v1.8b, v3.8b             \n"
+    "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v4.d}[0], [%0], %6             \n"
+    MEMACCESS(0)
+    "st1       {v6.d}[0], [%0]                 \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v5.d}[0], [%0], %7             \n"
+    MEMACCESS(0)
+    "st1       {v7.d}[0], [%0]                 \n"
+
+    "add       %1, %1, #4                      \n"  // src   += 2 * 2
+    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
+    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
+    "subs      %4,  %4,  #2                    \n"  // w     -= 2
+    "b.eq      4f                              \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+    MEMACCESS(2)
+    "st1       {v0.d}[0], [%2]                 \n"
+    MEMACCESS(3)
+    "st1       {v1.d}[0], [%3]                 \n"
+
+    "4:                                        \n"
+
+    : "=&r"(src_temp),                            // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
new file mode 100644
index 00000000..1300fc0f
--- /dev/null
+++ b/files/source/rotate_win.cc
@@ -0,0 +1,247 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    jg        convertloop
+
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w) {
+  __asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+
+    align      4
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqu    xmm2, [eax]
+    movdqu    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqu    xmm4, [eax]
+    movdqu    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqu    xmm6, [eax]
+    movdqu    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqu    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round of bit swap.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqu    xmm5, [esp]  // restore xmm5
+    movdqu    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqu    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
new file mode 100644
index 00000000..494164fd
--- /dev/null
+++ b/files/source/row_any.cc
@@ -0,0 +1,824 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled source needs to be increase by 1 of not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 const uint8* a_buf, uint8* dst_ptr,                           \
+                 const struct YuvConstants* yuvconstants,  int width) {        \
+      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
+      memset(temp, 0, 64 * 4);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      memcpy(temp + 192, a_buf + n, r);                                        \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+               yuvconstants, MASK + 1);                                        \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
+                 int width) {                                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      if (width & 1) {                                                         \
+        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
+        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
+      }                                                                        \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
+               yuvconstants, MASK + 1);                                        \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I411TOARGBROW_SSSE3
+ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+#endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#undef ANY31C
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
+      }                                                                        \
+      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+             SS(r, UVSHIFT) * SBPP2);                                          \
+      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
+                 int width) {                                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
+      }                                                                        \
+      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+             SS(r, UVSHIFT) * SBPP2);                                          \
+      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
+      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended.  Destination is read, modify, write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
+      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                          \
+      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 T shuffler, int width) {                                      \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
+      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
+    }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+       const uint32, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+       const uint32, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+       const uint32, 4, 2, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 const struct YuvConstants* yuvconstants, int width) {         \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
+      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 ptrdiff_t src_stride_ptr, int width,                          \
+                 int source_y_fraction) {                                      \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
+      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+    }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_DSPR2
+ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr, r * BPP);                                          \
+      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
+      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
+    }
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
+    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
+      SIMD_ALIGNED(uint8 temp[64]);                                            \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, v32, n);                                             \
+      }                                                                        \
+      ANY_SIMD(temp, v32, MASK + 1);                                           \
+      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
+    }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2.  Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
+      memset(temp, 0, 128);  /* for msan */                                    \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      /* repeat last 4 bytes for 422 subsampler */                             \
+      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+      }                                                                        \
+      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
+      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
+      }                                                                        \
+      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
+      }                                                                        \
+      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+      }                                                                        \
+      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
+      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
+      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
+    }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_DSPR2
+ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
+      memset(temp, 0, 128 * 2);  /* for msan */                                \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
+             SS(r, UVSHIFT) * BPP);                                            \
+      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+      }                                                                        \
+      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+    }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index c5f3ce05..32d2f686 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -4,13 +4,13 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/row.h"
 
-#include <string.h>  // For memcpy
+#include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
 
@@ -19,56 +19,60 @@ namespace libyuv {
 extern "C" {
 #endif
 
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 a = src_bgra[0];
-    uint8 r = src_bgra[1];
-    uint8 g = src_bgra[2];
-    uint8 b = src_bgra[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_bgra += 4;
-  }
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return ((-(v) >> 31) & (v));
 }
 
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 r = src_abgr[0];
-    uint8 g = src_abgr[1];
-    uint8 b = src_abgr[2];
-    uint8 a = src_abgr[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_abgr += 4;
-  }
+static __inline int32 clamp255(int32 v) {
+  return (((255 - (v)) >> 31) | (v)) & 255;
 }
 
-void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 a = src_abgr[0];
-    uint8 b = src_abgr[1];
-    uint8 g = src_abgr[2];
-    uint8 r = src_abgr[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_abgr += 4;
-  }
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  int m = v >> 31;
+  return (v + m) ^ m;
+}
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v > 255) ? 255 : v;
 }
 
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v < 0) ? -v : v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     uint8 b = src_rgb24[0];
     uint8 g = src_rgb24[1];
     uint8 r = src_rgb24[2];
@@ -82,7 +86,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
 }
 
 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     uint8 r = src_raw[0];
     uint8 g = src_raw[1];
     uint8 b = src_raw[2];
@@ -95,67 +100,72 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x1f;
-    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
-    uint8 r = src_rgb[1] >> 3;
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_rgb24[0] = b;
+    dst_rgb24[1] = g;
+    dst_rgb24[2] = r;
+    dst_rgb24 += 3;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 2) | (g >> 4);
     dst_argb[2] = (r << 3) | (r >> 2);
     dst_argb[3] = 255u;
     dst_argb += 4;
-    src_rgb += 2;
+    src_rgb565 += 2;
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x1f;
-    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
-    uint8 r = (src_rgb[1] & 0x7c) >> 2;
-    uint8 a = src_rgb[1] >> 7;
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 a = src_argb1555[1] >> 7;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 3) | (g >> 2);
     dst_argb[2] = (r << 3) | (r >> 2);
     dst_argb[3] = -a;
     dst_argb += 4;
-    src_rgb += 2;
+    src_argb1555 += 2;
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x0f;
-    uint8 g = src_rgb[0] >> 4;
-    uint8 r = src_rgb[1] & 0x0f;
-    uint8 a = src_rgb[1] >> 4;
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
     dst_argb[0] = (b << 4) | b;
     dst_argb[1] = (g << 4) | g;
     dst_argb[2] = (r << 4) | r;
     dst_argb[3] = (a << 4) | a;
     dst_argb += 4;
-    src_rgb += 2;
-  }
-}
-
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
-    uint8 a = src_argb[3];
-    dst_rgb[0] = a;
-    dst_rgb[1] = b;
-    dst_rgb[2] = g;
-    dst_rgb[3] = r;
-    dst_rgb += 4;
-    src_argb += 4;
+    src_argb4444 += 2;
   }
 }
 
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -168,7 +178,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 }
 
 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -180,17 +191,17 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-// TODO(fbarchard): support big endian CPU
 void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     uint8 b0 = src_argb[0] >> 3;
     uint8 g0 = src_argb[1] >> 2;
     uint8 r0 = src_argb[2] >> 3;
     uint8 b1 = src_argb[4] >> 3;
     uint8 g1 = src_argb[5] >> 2;
     uint8 r1 = src_argb[6] >> 3;
-    *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -198,12 +209,47 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 b0 = src_argb[0] >> 3;
     uint8 g0 = src_argb[1] >> 2;
     uint8 r0 = src_argb[2] >> 3;
-    *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB.  When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix.  But the dither4
+// will containing the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     uint8 b0 = src_argb[0] >> 3;
     uint8 g0 = src_argb[1] >> 3;
     uint8 r0 = src_argb[2] >> 3;
@@ -212,7 +258,7 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 g1 = src_argb[5] >> 3;
     uint8 r1 = src_argb[6] >> 3;
     uint8 a1 = src_argb[7] >> 7;
-    *reinterpret_cast<uint32*>(dst_rgb) =
+    *(uint32*)(dst_rgb) =
         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
         (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
     dst_rgb += 4;
@@ -223,13 +269,14 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 g0 = src_argb[1] >> 3;
     uint8 r0 = src_argb[2] >> 3;
     uint8 a0 = src_argb[3] >> 7;
-    *reinterpret_cast<uint16*>(dst_rgb) =
+    *(uint16*)(dst_rgb) =
         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
   }
 }
 
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     uint8 b0 = src_argb[0] >> 4;
     uint8 g0 = src_argb[1] >> 4;
     uint8 r0 = src_argb[2] >> 4;
@@ -238,7 +285,7 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 g1 = src_argb[5] >> 4;
     uint8 r1 = src_argb[6] >> 4;
     uint8 a1 = src_argb[7] >> 4;
-    *reinterpret_cast<uint32*>(dst_rgb) =
+    *(uint32*)(dst_rgb) =
         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
         (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
     dst_rgb += 4;
@@ -249,44 +296,46 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 g0 = src_argb[1] >> 4;
     uint8 r0 = src_argb[2] >> 4;
     uint8 a0 = src_argb[3] >> 4;
-    *reinterpret_cast<uint16*>(dst_rgb) =
+    *(uint16*)(dst_rgb) =
         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
   }
 }
 
 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
+  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
 }
 
 static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
-  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 }
 static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
-  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 }
 
-#define MAKEROWY(NAME, R, G, B) \
+#define MAKEROWY(NAME, R, G, B, BPP) \
 void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  for (int x = 0; x < width; ++x) {                                            \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
     dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += 4;                                                            \
+    src_argb0 += BPP;                                                          \
     dst_y += 1;                                                                \
   }                                                                            \
 }                                                                              \
 void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
                        uint8* dst_u, uint8* dst_v, int width) {                \
   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  for (int x = 0; x < width - 1; x += 2) {                                     \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] +                                \
-               src_rgb1[B] + src_rgb1[B + 4]) >> 2;                            \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] +                                \
-               src_rgb1[G] + src_rgb1[G + 4]) >> 2;                            \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] +                                \
-               src_rgb1[R] + src_rgb1[R + 4]) >> 2;                            \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {                                         \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
+               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
+               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
+               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
     dst_u[0] = RGBToU(ar, ag, ab);                                             \
     dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += 8;                                                             \
-    src_rgb1 += 8;                                                             \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
     dst_u += 1;                                                                \
     dst_v += 1;                                                                \
   }                                                                            \
@@ -299,21 +348,333 @@ void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
   }                                                                            \
 }
 
-MAKEROWY(ARGB, 2, 1, 0)
-MAKEROWY(BGRA, 1, 2, 3)
-MAKEROWY(ABGR, 0, 1, 2)
-MAKEROWY(RGBA, 3, 2, 1)
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
+// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b  0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r  0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
+                        uint8* dst_u, uint8* dst_v, int width) {               \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {                                         \
+    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
+                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
+    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
+                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
+    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
+                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {                                                             \
+    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
+    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
+    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+  }                                                                            \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    b = (b << 3) | (b >> 2);
+    g = (g << 2) | (g >> 4);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_rgb565 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    b = (b << 3) | (b >> 2);
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb1555 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    b = (b << 4) | b;
+    g = (g << 4) | g;
+    r = (r << 4) | r;
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb4444 += 2;
+    dst_y += 1;
+  }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b1 = src_rgb565[2] & 0x1f;
+    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8 r1 = src_rgb565[3] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b3 = next_rgb565[2] & 0x1f;
+    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8 r3 = next_rgb565[3] >> 3;
+    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 787 -> 888.
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_rgb565 += 4;
+    next_rgb565 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b = (b0 + b2);  // 565 * 2 = 676.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 676 -> 888
+    g = (g << 1) | (g >> 6);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b1 = src_argb1555[2] & 0x1f;
+    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b3 = next_argb1555[2] & 0x1f;
+    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 777 -> 888.
+    g = (g << 1) | (g >> 6);
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb1555 += 4;
+    next_argb1555 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = next_argb1555[1] >> 3;
+    uint8 b = (b0 + b2);  // 555 * 2 = 666.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;
+    uint8 b1 = src_argb4444[2] & 0x0f;
+    uint8 g1 = src_argb4444[2] >> 4;
+    uint8 r1 = src_argb4444[3] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b3 = next_argb4444[2] & 0x0f;
+    uint8 g3 = next_argb4444[2] >> 4;
+    uint8 r3 = next_argb4444[3] & 0x0f;
+    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb4444 += 4;
+    next_argb4444 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b = (b0 + b2);  // 444 * 2 = 555.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 3) | (b >> 2);  // 555 -> 888.
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
 
-// http://en.wikipedia.org/wiki/Grayscale.
-// 0.11 * B + 0.59 * G + 0.30 * R
-// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
-static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
-  return (( 76 * r + 152 * g +  28 * b) >> 8);
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 3; x += 4) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 16;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  // Odd width handling mimics 'any' function which replicates last pixel.
+  if ((width & 3) == 3) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
 }
 
 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = src_argb[3];
     dst_argb += 4;
@@ -323,7 +684,8 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 
 // Convert a row of image to Sepia tone.
 void ARGBSepiaRow_C(uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
     int g = dst_argb[1];
     int r = dst_argb[2];
@@ -331,60 +693,44 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
     int sg = (b * 22 + g * 88 + r * 45) >> 7;
     int sr = (b * 24 + g * 98 + r * 50) >> 7;
     // b does not over flow. a is preserved from original.
-    if (sg > 255) {
-      sg = 255;
-    }
-    if (sr > 255) {
-      sr = 255;
-    }
     dst_argb[0] = sb;
-    dst_argb[1] = sg;
-    dst_argb[2] = sr;
+    dst_argb[1] = clamp255(sg);
+    dst_argb[2] = clamp255(sr);
     dst_argb += 4;
   }
 }
 
 // Apply color matrix to a row of image. Matrix is signed.
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    int a = dst_argb[3];
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = src_argb[0];
+    int g = src_argb[1];
+    int r = src_argb[2];
+    int a = src_argb[3];
     int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
     int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
     int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
-    if (sb < 0) {
-      sb = 0;
-    }
-    if (sb > 255) {
-      sb = 255;
-    }
-    if (sg < 0) {
-      sg = 0;
-    }
-    if (sg > 255) {
-      sg = 255;
-    }
-    if (sr < 0) {
-      sr = 0;
-    }
-    if (sr > 255) {
-      sr = 255;
-    }
-    dst_argb[0] = sb;
-    dst_argb[1] = sg;
-    dst_argb[2] = sr;
+              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    dst_argb[0] = Clamp(sb);
+    dst_argb[1] = Clamp(sg);
+    dst_argb[2] = Clamp(sr);
+    dst_argb[3] = Clamp(sa);
+    src_argb += 4;
     dst_argb += 4;
   }
 }
 
 // Apply color table to a row of image.
 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
     int g = dst_argb[1];
     int r = dst_argb[2];
@@ -397,9 +743,24 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
   }
 }
 
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    dst_argb[0] = table_argb[b * 4 + 0];
+    dst_argb[1] = table_argb[g * 4 + 1];
+    dst_argb[2] = table_argb[r * 4 + 2];
+    dst_argb += 4;
+  }
+}
+
 void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
                        int interval_offset, int width) {
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
     int g = dst_argb[1];
     int r = dst_argb[2];
@@ -410,9 +771,192 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
   }
 }
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  const uint32 b_scale = REPEAT8(value & 0xff);
+  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32 a_scale = REPEAT8(value >> 24);
+
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb[0]);
+    const uint32 g = REPEAT8(src_argb[1]);
+    const uint32 r = REPEAT8(src_argb[2]);
+    const uint32 a = REPEAT8(src_argb[3]);
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb0[0]);
+    const uint32 g = REPEAT8(src_argb0[1]);
+    const uint32 r = REPEAT8(src_argb0[2]);
+    const uint32 a = REPEAT8(src_argb0[3]);
+    const uint32 b_scale = src_argb1[0];
+    const uint32 g_scale = src_argb1[1];
+    const uint32 r_scale = src_argb1[2];
+    const uint32 a_scale = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int b = src_argb0[0];
+    const int g = src_argb0[1];
+    const int r = src_argb0[2];
+    const int a = src_argb0[3];
+    const int b_add = src_argb1[0];
+    const int g_add = src_argb1[1];
+    const int r_add = src_argb1[2];
+    const int a_add = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_add);
+    dst_argb[1] = SHADE(g, g_add);
+    dst_argb[2] = SHADE(r, r_add);
+    dst_argb[3] = SHADE(a, a_add);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int b = src_argb0[0];
+    const int g = src_argb0[1];
+    const int r = src_argb0[2];
+    const int a = src_argb0[3];
+    const int b_sub = src_argb1[0];
+    const int g_sub = src_argb1[1];
+    const int r_sub = src_argb1[2];
+    const int a_sub = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_sub);
+    dst_argb[1] = SHADE(g, g_sub);
+    dst_argb[2] = SHADE(r, r_sub);
+    dst_argb[3] = SHADE(a, a_sub);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+// Sobel functions which mimics SSSE3.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i];
+    int b = src_y1[i];
+    int c = src_y2[i];
+    int a_sub = src_y0[i + 2];
+    int b_sub = src_y1[i + 2];
+    int c_sub = src_y2[i + 2];
+    int a_diff = a - a_sub;
+    int b_diff = b - b_sub;
+    int c_diff = c - c_sub;
+    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+    dst_sobelx[i] = (uint8)(clamp255(sobel));
+  }
+}
+
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i + 0];
+    int b = src_y0[i + 1];
+    int c = src_y0[i + 2];
+    int a_sub = src_y1[i + 0];
+    int b_sub = src_y1[i + 1];
+    int c_sub = src_y1[i + 2];
+    int a_diff = a - a_sub;
+    int b_diff = b - b_sub;
+    int c_diff = c - c_sub;
+    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+    dst_sobely[i] = (uint8)(clamp255(sobel));
+  }
+}
+
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int s = clamp255(r + b);
+    dst_argb[0] = (uint8)(s);
+    dst_argb[1] = (uint8)(s);
+    dst_argb[2] = (uint8)(s);
+    dst_argb[3] = (uint8)(255u);
+    dst_argb += 4;
+  }
+}
+
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int s = clamp255(r + b);
+    dst_y[i] = (uint8)(s);
+  }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int g = clamp255(r + b);
+    dst_argb[0] = (uint8)(b);
+    dst_argb[1] = (uint8)(g);
+    dst_argb[2] = (uint8)(r);
+    dst_argb[3] = (uint8)(255u);
+    dst_argb += 4;
+  }
+}
+
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     uint8 y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
@@ -421,250 +965,786 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   }
 }
 
-// C reference code that mimics the YUV assembly.
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
 
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414  * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
 
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference
+// *  R = Y                - V * -1.28033
+// *  G = Y - U *  0.21482 - V *  0.38059
+// *  B = Y - U * -2.12798
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32  /* 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.12798 * 64)) */
+#define UG 14 /* round(0.21482 * 64) */
+#define VG 24 /* round(0.38059  * 64) */
+#define VR -82 /* round(-1.28033 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
 
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
 
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+// C reference code that mimics the YUV assembly.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+                              uint8* b, uint8* g, uint8* r,
+                              const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
 
-static __inline uint32 Clip(int32 val) {
-  if (val < 0) {
-    return static_cast<uint32>(0);
-  } else if (val > 255) {
-    return static_cast<uint32>(255);
-  }
-  return static_cast<uint32>(val);
+  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
+  *b = Clamp((int32)(-(u * ub         ) + y1 + bb) >> 6);
+  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
+  *r = Clamp((int32)(-(         v * vr) + y1 + br) >> 6);
 }
 
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
-                              int ashift, int rshift, int gshift, int bshift) {
-  int32 y1 = (static_cast<int32>(y) - 16) * YG;
-  uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
-  uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
-  uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
-  *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |
-                                        (g << gshift) |
-                                        (r << rshift) |
-                                        (255u << ashift);
-}
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
-                               uint8* b, uint8* g, uint8* r) {
-  int32 y1 = (static_cast<int32>(y) - 16) * YG;
-  *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
-  *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
-  *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32)(y1 + YGB) >> 6);
+  *g = Clamp((int32)(y1 + YGB) >> 6);
+  *r = Clamp((int32)(y1 + YGB) >> 6);
 }
 
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+#undef YG
+#undef YGB
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C mimic assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width; ++x) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
-    y_buf += 1;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
     rgb_buf += 4;  // Advance 1 pixel.
   }
 }
+#endif
 
 // Also used for 420
-void I422ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+void I422AlphaToARGBRow_C(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          const uint8* src_a,
+                          uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = src_a[0];
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = src_a[1];
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    src_a += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = src_a[0];
   }
 }
 
-void I422ToRGB24Row_C(const uint8* y_buf,
-                      const uint8* u_buf,
-                      const uint8* v_buf,
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
                       uint8* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
                       int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
-              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
     rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-  }
-}
-
-void I422ToRAWRow_C(const uint8* y_buf,
-                    const uint8* u_buf,
-                    const uint8* v_buf,
-                    uint8* rgb_buf,
-                    int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
-              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
-              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
-    rgb_buf += 6;  // Advance 2 pixels.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+  }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    b0 = b0 >> 4;
+    g0 = g0 >> 4;
+    r0 = r0 >> 4;
+    b1 = b1 >> 4;
+    g1 = g1 >> 4;
+    r1 = r1 >> 4;
+    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb4444 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    b0 = b0 >> 4;
+    g0 = g0 >> 4;
+    r0 = r0 >> 4;
+    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+        0xf000;
   }
 }
 
-void I411ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 3;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 3;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb1555 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 3;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+        0x8000;
+  }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 3; x += 4) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
-    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
-    y_buf += 4;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 3; x += 4) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    YuvPixel(src_y[2], src_u[0], src_v[0],
+             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
+    rgb_buf[11] = 255;
+    YuvPixel(src_y[3], src_u[0], src_v[0],
+             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
+    rgb_buf[15] = 255;
+    src_y += 4;
+    src_u += 1;
+    src_v += 1;
     rgb_buf += 16;  // Advance 4 pixels.
   }
   if (width & 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
   }
 }
 
-void NV12ToARGBRow_C(const uint8* y_buf,
-                     const uint8* uv_buf,
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    uv_buf += 2;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_uv[0], src_uv[1],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
   }
 }
 
-void NV21ToARGBRow_C(const uint8* y_buf,
-                     const uint8* vu_buf,
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    vu_buf += 2;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_vu[1], src_vu[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_vu += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    src_y += 2;
+    src_uv += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void I422ToBGRARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_yuy2 += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
   }
 }
 
-void I422ToABGRRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void UYVYToARGBRow_C(const uint8* src_uyvy,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_uyvy += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
   }
 }
 
-void I422ToRGBARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                      uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    rgb_buf[0] = 255;
   }
 }
 
-void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
-  for (int x = 0; x < width; ++x) {
-    YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
-    y_buf += 1;
-    rgb_buf += 4;  // Advance 1 pixel.
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
   }
 }
 
 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+  int x;
   src += width - 1;
-  for (int x = 0; x < width - 1; x += 2) {
+  for (x = 0; x < width - 1; x += 2) {
     dst[x] = src[0];
     dst[x + 1] = src[-1];
     src -= 2;
@@ -674,9 +1754,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int x;
   src_uv += (width - 1) << 1;
-  for (int x = 0; x < width - 1; x += 2) {
+  for (x = 0; x < width - 1; x += 2) {
     dst_u[x] = src_uv[0];
     dst_u[x + 1] = src_uv[-2];
     dst_v[x] = src_uv[1];
@@ -690,10 +1771,11 @@ void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
 }
 
 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
-  const uint32* src32 = reinterpret_cast<const uint32*>(src);
-  uint32* dst32 = reinterpret_cast<uint32*>(dst);
+  int x;
+  const uint32* src32 = (const uint32*)(src);
+  uint32* dst32 = (uint32*)(dst);
   src32 += width - 1;
-  for (int x = 0; x < width - 1; x += 2) {
+  for (x = 0; x < width - 1; x += 2) {
     dst32[x] = src32[0];
     dst32[x + 1] = src32[-1];
     src32 -= 2;
@@ -703,8 +1785,9 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     dst_u[x] = src_uv[0];
     dst_u[x + 1] = src_uv[2];
     dst_v[x] = src_uv[1];
@@ -717,29 +1800,39 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x];
+    dst_uv[1] = src_v[x];
+    dst_uv[2] = src_u[x + 1];
+    dst_uv[3] = src_v[x + 1];
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
 
-void SetRow8_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
-  // VC will generate rep stosb.
-  for (int x = 0; x < count; ++x) {
-    dst[x] = v8;
-  }
-#else
-  memset(dst, v8, count);
-#endif
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+  memcpy(dst, src, count * 2);
+}
+
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+  memset(dst, v8, width);
 }
 
-void SetRows32_C(uint8* dst, uint32 v32, int width,
-                 int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    for (int x = 0; x < width; ++x) {
-      d[x] = v32;
-    }
-    dst += dst_stride;
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+  uint32* d = (uint32*)(dst_argb);
+  int x;
+  for (x = 0; x < width; ++x) {
+    d[x] = v32;
   }
 }
 
@@ -747,7 +1840,8 @@ void SetRows32_C(uint8* dst, uint32 v32, int width,
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values, filtering 2 rows of YUY2.
-  for (int x = 0; x < width; x += 2) {
+  int x;
+  for (x = 0; x < width; x += 2) {
     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
     src_yuy2 += 4;
@@ -760,7 +1854,8 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
 void YUY2ToUV422Row_C(const uint8* src_yuy2,
                       uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values.
-  for (int x = 0; x < width; x += 2) {
+  int x;
+  for (x = 0; x < width; x += 2) {
     dst_u[0] = src_yuy2[1];
     dst_v[0] = src_yuy2[3];
     src_yuy2 += 4;
@@ -772,7 +1867,8 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2,
 // Copy row of YUY2 Y's (422) into Y (420/422).
 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   // Output a row of Y values.
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     dst_y[x] = src_yuy2[0];
     dst_y[x + 1] = src_yuy2[2];
     src_yuy2 += 4;
@@ -786,7 +1882,8 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
                    uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values.
-  for (int x = 0; x < width; x += 2) {
+  int x;
+  for (x = 0; x < width; x += 2) {
     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
     src_uyvy += 4;
@@ -799,7 +1896,8 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
 void UYVYToUV422Row_C(const uint8* src_uyvy,
                       uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values.
-  for (int x = 0; x < width; x += 2) {
+  int x;
+  for (x = 0; x < width; x += 2) {
     dst_u[0] = src_uyvy[0];
     dst_v[0] = src_uyvy[2];
     src_uyvy += 4;
@@ -811,7 +1909,8 @@ void UYVYToUV422Row_C(const uint8* src_uyvy,
 // Copy row of UYVY Y's (422) into Y (420/422).
 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
   // Output a row of Y values.
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     dst_y[x] = src_uyvy[1];
     dst_y[x + 1] = src_uyvy[3];
     src_uyvy += 4;
@@ -828,7 +1927,8 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
 // This code mimics the SSSE3 version for better testability.
 void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
     uint32 fb = src_argb0[0];
     uint32 fg = src_argb0[1];
     uint32 fr = src_argb0[2];
@@ -872,12 +1972,32 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
   }
 }
 #undef BLEND
+
+#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+    src0 += 2;
+    src1 += 2;
+    alpha += 2;
+    dst += 2;
+  }
+  if (width & 1) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+  }
+}
+#undef UBLEND
+
 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
 
 // Multiply source RGB by alpha and store to destination.
 // This code mimics the SSSE3 version for better testability.
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int i = 0; i < width - 1; i += 2) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
     uint32 b = src_argb[0];
     uint32 g = src_argb[1];
     uint32 r = src_argb[2];
@@ -916,10 +2036,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // g = (g * 255 + (a / 2)) / a;
 // r = (r * 255 + (a / 2)) / a;
 // Reciprocal method is off by 1 on some values. ie 125
-// 8.16 fixed point inverse table
-#define T(a) 0x10000 / a
-uint32 fixed_invtbl8[256] = {
-  0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
   T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
   T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
   T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
@@ -950,202 +2070,35 @@ uint32 fixed_invtbl8[256] = {
   T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
   T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
   T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
 #undef T
 
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int i = 0; i < width; ++i) {
+  int i;
+  for (i = 0; i < width; ++i) {
     uint32 b = src_argb[0];
     uint32 g = src_argb[1];
     uint32 r = src_argb[2];
     const uint32 a = src_argb[3];
-    if (a) {
-      const uint32 ia = fixed_invtbl8[a];  // 8.16 fixed point
-      b = (b * ia) >> 8;
-      g = (g * ia) >> 8;
-      r = (r * ia) >> 8;
-      // Clamping should not be necessary but is free in assembly.
-      if (b > 255) {
-        b = 255;
-      }
-      if (g > 255) {
-        g = 255;
-      }
-      if (r > 255) {
-        r = 255;
-      }
-    }
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
+    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
+    b = (b * ia) >> 8;
+    g = (g * ia) >> 8;
+    r = (r * ia) >> 8;
+    // Clamping should not be necessary but is free in assembly.
+    dst_argb[0] = clamp255(b);
+    dst_argb[1] = clamp255(g);
+    dst_argb[2] = clamp255(r);
     dst_argb[3] = a;
     src_argb += 4;
     dst_argb += 4;
   }
 }
 
-// Wrappers to handle odd width
-#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT)                    \
-    void NAMEANY(const uint8* y_buf,                                           \
-                 const uint8* u_buf,                                           \
-                 const uint8* v_buf,                                           \
-                 uint8* rgb_buf,                                               \
-                 int width) {                                                  \
-      int n = width & ~7;                                                      \
-      I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n);                          \
-      I420TORGB_C(y_buf + n,                                                   \
-                  u_buf + (n >> UV_SHIFT),                                     \
-                  v_buf + (n >> UV_SHIFT),                                     \
-                  rgb_buf + n * 4, width & 7);                                 \
-    }
-
-// Wrappers to handle odd width
-#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT)                    \
-    void NAMEANY(const uint8* y_buf,                                           \
-                 const uint8* uv_buf,                                          \
-                 uint8* rgb_buf,                                               \
-                 int width) {                                                  \
-      int n = width & ~7;                                                      \
-      NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n);                                \
-      NV12TORGB_C(y_buf + n,                                                   \
-                  uv_buf + (n >> UV_SHIFT),                                    \
-                  rgb_buf + n * 4, width & 7);                                 \
-    }
-
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
-Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
-Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
-#endif
-#ifdef HAS_I422TORGB24ROW_SSSE3
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3,                 \
-     I422ToRGB24Row_C, 1)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
-#endif
-#ifdef HAS_I422TORGBAROW_SSSE3
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
-#endif
-#ifdef HAS_I422TOARGBROW_NEON
-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
-YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
-Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
-Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
-YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
-YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
-#endif
-#undef YANY
-
-#define RGBANY(NAMEANY, ARGBTORGB, BPP)                                        \
-    void NAMEANY(const uint8* argb_buf,                                        \
-                 uint8* rgb_buf,                                               \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 row[kMaxStride]);                                     \
-      ARGBTORGB(argb_buf, row, width);                                         \
-      memcpy(rgb_buf, row, width * BPP);                                       \
-    }
-
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
-RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
-RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
-RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
-RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3)
-#endif
-#undef RGBANY
-
-#define YANY(NAMEANY, ARGBTOY_SSE, BPP)                                        \
-    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
-      ARGBTOY_SSE(src_argb, dst_y, width - 16);                                \
-      ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16);    \
-    }
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
-#endif
-#ifdef HAS_YUY2TOYROW_SSE2
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
-#endif
-#undef YANY
-
-#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP)                            \
-    void NAMEANY(const uint8* src_argb, int src_stride_argb,                   \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      int n = width & ~15;                                                     \
-      ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n);                 \
-      ANYTOUV_C(src_argb  + n * BPP, src_stride_argb,                          \
-                 dst_u + (n >> 1),                                             \
-                 dst_v + (n >> 1),                                             \
-                 width & 15);                                                  \
-    }
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
-#endif
-#undef UVANY
-
-#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP)                         \
-    void NAMEANY(const uint8* src_argb,                                        \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      int n = width & ~15;                                                     \
-      ANYTOUV_SSE(src_argb, dst_u, dst_v, n);                                  \
-      ANYTOUV_C(src_argb  + n * BPP,                                           \
-                 dst_u + (n >> 1),                                             \
-                 dst_v + (n >> 1),                                             \
-                 width & 15);                                                  \
-    }
-
-#ifdef HAS_YUY2TOUV422ROW_SSE2
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,               \
-         YUY2ToUV422Row_C, 2)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,               \
-         UYVYToUV422Row_C, 2)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,                         \
-         YUY2ToUV422Row_C, 2)
-UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,                         \
-         UYVYToUV422Row_C, 2)
-#endif
-#undef UV422ANY
-
 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                                const int32* previous_cumsum, int width) {
   int32 row_sum[4] = {0, 0, 0, 0};
-  for (int x = 0; x < width; ++x) {
+  int x;
+  for (x = 0; x < width; ++x) {
     row_sum[0] += row[x * 4 + 0];
     row_sum[1] += row[x * 4 + 1];
     row_sum[2] += row[x * 4 + 2];
@@ -1157,59 +2110,35 @@ void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
   }
 }
 
-void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
-                              int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+                                int w, int area, uint8* dst, int count) {
   float ooa = 1.0f / area;
-  for (int i = 0; i < count; ++i) {
-    dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
-    dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
-    dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
-    dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+  int i;
+  for (i = 0; i < count; ++i) {
+    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
     dst += 4;
     tl += 4;
     bl += 4;
   }
 }
 
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
-
-  for (int i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
-    dst_argb[0] = SHADE(b, b_scale);
-    dst_argb[1] = SHADE(g, g_scale);
-    dst_argb[2] = SHADE(r, r_scale);
-    dst_argb[3] = SHADE(a, a_scale);
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-#undef REPEAT8
-#undef SHADE
-
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width) {
+  int i;
   // Render a row of pixels from source into a buffer.
   float uv[2];
   uv[0] = uv_dudv[0];
   uv[1] = uv_dudv[1];
-  for (int i = 0; i < width; ++i) {
-    int x = static_cast<int>(uv[0]);
-    int y = static_cast<int>(uv[1]);
-    *reinterpret_cast<uint32*>(dst_argb) =
-        *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
+  for (i = 0; i < width; ++i) {
+    int x = (int)(uv[0]);
+    int y = (int)(uv[1]);
+    *(uint32*)(dst_argb) =
+        *(const uint32*)(src_argb + y * src_argb_stride +
                                          x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
@@ -1217,29 +2146,481 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
   }
 }
 
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
+                      uint8* dst_uv, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
+static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
+                         uint16* dst_uv, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
 // C version 2x2 -> 2x1.
-void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction ;
   int y0_fraction = 256 - y1_fraction;
   const uint8* src_ptr1 = src_ptr + src_stride;
-  uint8* end = dst_ptr + (dst_width << 2);
-  do {
+  int x;
+  if (y1_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+  if (y1_fraction == 128) {
+    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width - 1; x += 2) {
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+    dst_ptr[1] =
+        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
+    src_ptr += 2;
+    src_ptr1 += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+  }
+}
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16* src_ptr1 = src_ptr + src_stride;
+  int x;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width - 1; x += 2) {
     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
-    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
-    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
-    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
-    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
-    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
-    src_ptr += 8;
-    src_ptr1 += 8;
-    dst_ptr += 8;
-  } while (dst_ptr < end);
+    src_ptr += 2;
+    src_ptr1 += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+  }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int width) {
+  int index0 = shuffler[0];
+  int index1 = shuffler[1];
+  int index2 = shuffler[2];
+  int index3 = shuffler[3];
+  // Shuffle a row of ARGB.
+  int x;
+  for (x = 0; x < width; ++x) {
+    // To support in-place conversion.
+    uint8 b = src_argb[index0];
+    uint8 g = src_argb[index1];
+    uint8 r = src_argb[index2];
+    uint8 a = src_argb[index3];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = src_y[1];
+    dst_frame[3] = src_v[0];
+    dst_frame += 4;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+  }
+  if (width & 1) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = 0;
+    dst_frame[3] = src_v[0];
+  }
 }
 
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = src_y[1];
+    dst_frame += 4;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+  }
+  if (width & 1) {
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = 0;
+  }
+}
+
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const float* poly,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float b = (float)(src_argb[0]);
+    float g = (float)(src_argb[1]);
+    float r = (float)(src_argb[2]);
+    float a = (float)(src_argb[3]);
+    float b2 = b * b;
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    float b3 = b2 * b;
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[8] * b2;
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    db += poly[12] * b3;
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+
+    dst_argb[0] = Clamp((int32)(db));
+    dst_argb[1] = Clamp((int32)(dg));
+    dst_argb[2] = Clamp((int32)(dr));
+    dst_argb[3] = Clamp((int32)(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  uint32 bc = lumacoeff & 0xff;
+  uint32 gc = (lumacoeff >> 8) & 0xff;
+  uint32 rc = (lumacoeff >> 16) & 0xff;
+
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma1;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+              src_argb[6] * rc) & 0x7F00u) + luma;
+    dst_argb[4] = luma1[src_argb[4]];
+    dst_argb[5] = luma1[src_argb[5]];
+    dst_argb[6] = luma1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    dst += 8;
+    src += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst_a[0] = src_argb[3];
+    dst_a[1] = src_argb[7];
+    dst_a += 2;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    dst_a[0] = src_argb[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    dst += 8;
+    src += 2;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+    defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb1555 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb4444 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+    ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb1555 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+    ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb4444 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_rgb24,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
new file mode 100644
index 00000000..1ac7ef1a
--- /dev/null
+++ b/files/source/row_gcc.cc
@@ -0,0 +1,5534 @@
+// VERSION 2
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPeg full range.
+static vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA
+static vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+static vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5.
+static vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting RAW to RGB24.  First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm1                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleMaskRGB24ToARGB)  // %3
+  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleMaskRAWToARGB)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  asm volatile (
+   "movdqa     %3,%%xmm3                       \n"
+   "movdqa     %4,%%xmm4                       \n"
+   "movdqa     %5,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
+    "lea       " MEMLEA(0x18,0) ",%0           \n"
+    "pshufb    %%xmm3,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
+  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
+    "m"(kShuffleMaskRAWToRGB24_1),  // %4
+    "m"(kShuffleMaskRAWToRGB24_2)   // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "mov       $0x20802080,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xa,%%xmm4                     \n"
+    "psrlw     $0x5,%%xmm4                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "psllw     $0xb,%%xmm2                     \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "pmulhuw   %%xmm6,%%xmm0                   \n"
+    "por       %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "mov       $0x42004200,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"
+    "movdqa    %%xmm3,%%xmm4                   \n"
+    "psrlw     $0x6,%%xmm4                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psllw     $0x1,%%xmm1                     \n"
+    "psllw     $0xb,%%xmm2                     \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "pmulhuw   %%xmm6,%%xmm0                   \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "mov       $0xf0f0f0f,%%eax                \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x4,%%xmm5                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "psllw     $0x4,%%xmm1                     \n"
+    "psrlw     $0x4,%%xmm3                     \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(kShuffleMaskARGBToRGB24)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(kShuffleMaskARGBToRAW)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psrld     $0x1b,%%xmm3                    \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1a,%%xmm4                    \n"
+    "pslld     $0x5,%%xmm4                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0xb,%%xmm5                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pslld     $0x8,%%xmm0                     \n"
+    "psrld     $0x3,%%xmm1                     \n"
+    "psrld     $0x5,%%xmm2                     \n"
+    "psrad     $0x10,%%xmm0                    \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    "movd       %3,%%xmm6                      \n"
+    "punpcklbw  %%xmm6,%%xmm6                  \n"
+    "movdqa     %%xmm6,%%xmm7                  \n"
+    "punpcklwd  %%xmm6,%%xmm6                  \n"
+    "punpckhwd  %%xmm7,%%xmm7                  \n"
+    "pcmpeqb    %%xmm3,%%xmm3                  \n"
+    "psrld      $0x1b,%%xmm3                   \n"
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrld      $0x1a,%%xmm4                   \n"
+    "pslld      $0x5,%%xmm4                    \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pslld      $0xb,%%xmm5                    \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "paddusb    %%xmm6,%%xmm0                  \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "pslld      $0x8,%%xmm0                    \n"
+    "psrld      $0x3,%%xmm1                    \n"
+    "psrld      $0x5,%%xmm2                    \n"
+    "psrad      $0x10,%%xmm0                   \n"
+    "pand       %%xmm3,%%xmm1                  \n"
+    "pand       %%xmm4,%%xmm2                  \n"
+    "pand       %%xmm5,%%xmm0                  \n"
+    "por        %%xmm2,%%xmm1                  \n"
+    "por        %%xmm1,%%xmm0                  \n"
+    "packssdw   %%xmm0,%%xmm0                  \n"
+    "lea        0x10(%0),%0                    \n"
+    "movq       %%xmm0,(%1)                    \n"
+    "lea        0x8(%1),%1                     \n"
+    "sub        $0x4,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(dither4) // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    "vbroadcastss %3,%%xmm6                    \n"
+    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
+    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
+    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
+    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    (%0),%%ymm0                    \n"
+    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "lea        0x20(%0),%0                    \n"
+    "vmovdqu    %%xmm0,(%1)                    \n"
+    "lea        0x10(%1),%1                    \n"
+    "sub        $0x8,%2                        \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(dither4) // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1b,%%xmm4                    \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x5,%%xmm5                     \n"
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "pslld     $0xa,%%xmm6                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "pslld     $0xf,%%xmm7                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "psrad     $0x10,%%xmm0                    \n"
+    "psrld     $0x3,%%xmm1                     \n"
+    "psrld     $0x6,%%xmm2                     \n"
+    "psrld     $0x9,%%xmm3                     \n"
+    "pand      %%xmm7,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm6,%%xmm3                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm4,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm3,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "psrlq     $0x4,%%xmm0                     \n"
+    "psrlq     $0x8,%%xmm1                     \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToYJ),  // %3
+    "m"(kAddYJ64)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vbroadcastf128 %4,%%ymm5                  \n"
+    "vmovdqu    %5,%%ymm6                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vbroadcastf128 %4,%%ymm5                  \n"
+    "vmovdqu    %5,%%ymm6                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToV),  // %5
+    "m"(kARGBToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vbroadcastf128 %5,%%ymm5                  \n"
+    "vbroadcastf128 %6,%%ymm6                  \n"
+    "vbroadcastf128 %7,%%ymm7                  \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUV128),  // %5
+    "m"(kARGBToV),   // %6
+    "m"(kARGBToU),   // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vbroadcastf128 %5,%%ymm5                  \n"
+    "vbroadcastf128 %6,%%ymm6                  \n"
+    "vbroadcastf128 %7,%%ymm7                  \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUVJ128),  // %5
+    "m"(kARGBToVJ),  // %6
+    "m"(kARGBToUJ),  // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToVJ),  // %5
+    "m"(kARGBToUJ),  // %6
+    "m"(kAddUVJ128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                          int width) {
+  asm volatile (
+    "movdqa    %4,%%xmm3                       \n"
+    "movdqa    %5,%%xmm4                       \n"
+    "movdqa    %6,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),        // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kBGRAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_bgra)), // %4
+    "m"(kBGRAToV),  // %5
+    "m"(kBGRAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+  );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)), // %4
+    "m"(kABGRToV),  // %5
+    "m"(kABGRToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+  );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba)), // %4
+    "m"(kRGBAToV),  // %5
+    "m"(kRGBAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+  );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444
+#define READYUV444                                                             \
+    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422                                                            \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
+    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
+
+// Read 2 UV from 411, upsample to 8 UV.
+// reading 4 bytes is an msan violation.
+//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
+//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
+// pinsrw fails with drmemory
+//  __asm pinsrw     xmm0, [esi], 0        /* U */
+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
+#define READYUV411_TEMP                                                        \
+    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
+    "movd       %[temp],%%xmm0                                  \n"            \
+    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
+    "movd       %[temp],%%xmm1                                  \n"            \
+    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12                                                               \
+    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21                                                               \
+    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
+    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2                                                               \
+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
+    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
+    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
+    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY                                                               \
+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
+    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
+    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
+    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants)                                           \
+    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
+    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
+    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
+    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants)                                                 \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     %%xmm11,%%xmm0                                  \n"            \
+    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     %%xmm12,%%xmm1                                  \n"            \
+    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     %%xmm13,%%xmm2                                  \n"            \
+    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
+    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
+    "paddsw     %%xmm4,%%xmm0                                   \n"            \
+    "paddsw     %%xmm4,%%xmm1                                   \n"            \
+    "paddsw     %%xmm4,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB_REGS \
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants)                                                 \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
+    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
+    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
+    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
+    "paddsw     %%xmm4,%%xmm0                                   \n"            \
+    "paddsw     %%xmm4,%%xmm1                                   \n"            \
+    "paddsw     %%xmm4,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB_REGS
+#endif
+
+// Store 8 ARGB values.
+#define STOREARGB                                                              \
+    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
+    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
+    "movdqa     %%xmm0,%%xmm1                                    \n"           \
+    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
+    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
+    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
+    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
+    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+
+// Store 8 RGBA values.
+#define STORERGBA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* dst_rgb24,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
+    "sub       %[u_buf],%[v_buf]               \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "subl      $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
+    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     const uint8* a_buf,
+                                     uint8* dst_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUVA422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422ALPHATOARGBROW_SSSE3
+
+#ifdef HAS_I411TOARGBROW_SSSE3
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  int temp;
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411_TEMP
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),        // %[y_buf]
+    [u_buf]"+r"(u_buf),        // %[u_buf]
+    [v_buf]"+r"(v_buf),        // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [temp]"=&r"(temp),         // %[temp]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)         // %[width]
+#else
+    [width]"+rm"(width)        // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* vu_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUY2
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READUYVY
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_rgba,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STORERGBA
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2                                                        \
+    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
+    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2                                                        \
+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
+#define READYUVA422_AVX2                                                       \
+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
+    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
+    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
+    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
+
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2                                                        \
+    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
+    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
+    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
+    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
+    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
+    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2                                                          \
+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
+    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
+    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
+    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
+    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
+    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
+    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
+    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
+    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
+    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
+    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
+#define YUVTORGB_AVX2(yuvconstants)                                            \
+    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
+    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
+    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
+    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
+    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
+    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
+    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_REGS_AVX2 \
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+#else  // Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+#define YUVTORGB_AVX2(yuvconstants)                                            \
+    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
+    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
+    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
+    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
+    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
+    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
+    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
+    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_REGS_AVX2
+#endif
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2                                                         \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
+    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
+    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               const uint8* a_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "subl      $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+
+    // Step 3: Weave into RGBA
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* uv_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* vu_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV21_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUY2_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READUYVY_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
+    "movd      %%eax,%%xmm2                    \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
+    "movd      %%eax,%%xmm3                    \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "psubusw   %%xmm3,%%xmm0                   \n"
+    "psrlw     $6, %%xmm0                      \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm1                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "por       %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
+    "vmovd      %%eax,%%xmm2                   \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+    "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
+    "vmovd      %%eax,%%xmm3                   \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpslld     $0x18,%%ymm4,%%ymm4            \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
+    "lea        " MEMLEA(0x10,0) ",%0          \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub        $0x10,%2                       \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "movdqa    %3,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm5                  \n"
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "movdqa    %4,%%xmm1                       \n"
+    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
+    "pshufb    %%xmm1,%%xmm0                   \n"
+    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $8,%3                           \n"
+    "jg        1b                              \n"
+  : "+r"(src),      // %0
+    "+r"(dst_u),    // %1
+    "+r"(dst_v),    // %2
+    "+r"(temp_width)  // %3
+  : "m"(kShuffleMirrorUV)  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "vmovdqu    %3,%%ymm5                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x8,%2                        \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kARGBShuffleMirror_AVX2) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
+    "lea        " MEMLEA(0x40,0) ",%0            \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
+    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
+    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
+    "lea        " MEMLEA(0x20,1) ",%1            \n"
+    "sub        $0x20,%3                         \n"
+    "jg         1b                               \n"
+    "vzeroupper                                  \n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(width)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                    \n"
+    "psrlw      $0x8,%%xmm5                      \n"
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
+    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
+    "lea        " MEMLEA(0x20,0) ",%0            \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "pand       %%xmm5,%%xmm0                    \n"
+    "pand       %%xmm5,%%xmm1                    \n"
+    "packuswb   %%xmm1,%%xmm0                    \n"
+    "psrlw      $0x8,%%xmm2                      \n"
+    "psrlw      $0x8,%%xmm3                      \n"
+    "packuswb   %%xmm3,%%xmm2                    \n"
+    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
+    "lea        " MEMLEA(0x10,1) ",%1            \n"
+    "sub        $0x10,%3                         \n"
+    "jg         1b                               \n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(width)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
+    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
+    "lea       " MEMLEA(0x20,0) ",%0             \n"
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
+    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
+    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+    "lea       " MEMLEA(0x40,2) ",%2             \n"
+    "sub       $0x20,%3                          \n"
+    "jg        1b                                \n"
+    "vzeroupper                                  \n"
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm2                     \n"
+    "punpcklbw %%xmm1,%%xmm0                     \n"
+    "punpckhbw %%xmm1,%%xmm2                     \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
+    "lea       " MEMLEA(0x20,2) ",%2             \n"
+    "sub       $0x10,%3                          \n"
+    "jg        1b                                \n"
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    "test       $0xf,%0                        \n"
+    "jne        2f                             \n"
+    "test       $0xf,%1                        \n"
+    "jne        2f                             \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       9f                              \n"
+    LABELALIGN
+  "2:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        2b                              \n"
+  "9:                                          \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x40,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep movsb " MEMMOVESTRING(0,1) "          \n"
+  : "+S"(src),  // %0
+    "+D"(dst),  // %1
+    "+c"(width_tmp) // %2
+  :
+  : "memory", "cc"
+  );
+}
+#endif  // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"
+    "pslld     $0x18,%%xmm0                    \n"
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"
+    "psrld     $0x8,%%xmm1                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
+    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
+    "lea       " MEMLEA(0x20, 0) ", %0         \n"
+    "psrld     $0x18, %%xmm0                   \n"
+    "psrld     $0x18, %%xmm1                   \n"
+    "packssdw  %%xmm1, %%xmm0                  \n"
+    "packuswb  %%xmm0, %%xmm0                  \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8, 1) ", %1          \n"
+    "sub       $0x8, %2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_a),     // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"
+    "pslld     $0x18,%%xmm0                    \n"
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"
+    "psrld     $0x8,%%xmm1                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "punpckhwd %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm2,%%xmm2                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
+    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
+    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+  size_t width_tmp = (size_t)(width >> 2);
+  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep stosb " MEMSTORESTRING(al,0) "        \n"
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v8)          // %2
+    : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
+    : "+D"(dst_argb),  // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 8 pixels at a time
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        99f                             \n"
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       91b                             \n"
+  "99:                                         \n"
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  : "m"(kShuffleAlpha)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "psllw      $0x8,%%xmm5                    \n"
+    "mov        $0x80808080,%%eax              \n"
+    "movd       %%eax,%%xmm6                   \n"
+    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
+    "mov        $0x807f807f,%%eax              \n"
+    "movd       %%eax,%%xmm7                   \n"
+    "pshufd     $0x0,%%xmm7,%%xmm7             \n"
+    "sub        %2,%0                          \n"
+    "sub        %2,%1                          \n"
+    "sub        %2,%3                          \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq       (%2),%%xmm0                    \n"
+    "punpcklbw  %%xmm0,%%xmm0                  \n"
+    "pxor       %%xmm5,%%xmm0                  \n"
+    "movq       (%0,%2,1),%%xmm1               \n"
+    "movq       (%1,%2,1),%%xmm2               \n"
+    "punpcklbw  %%xmm2,%%xmm1                  \n"
+    "psubb      %%xmm6,%%xmm1                  \n"
+    "pmaddubsw  %%xmm1,%%xmm0                  \n"
+    "paddw      %%xmm7,%%xmm0                  \n"
+    "psrlw      $0x8,%%xmm0                    \n"
+    "packuswb   %%xmm0,%%xmm0                  \n"
+    "movq       %%xmm0,(%3,%2,1)               \n"
+    "lea        0x8(%2),%2                     \n"
+    "sub        $0x8,%4                        \n"
+    "jg        1b                              \n"
+  : "+r"(src0),       // %0
+    "+r"(src1),       // %1
+    "+r"(alpha),      // %2
+    "+r"(dst),        // %3
+    "+rm"(width)      // %4
+  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                        const uint8* alpha, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
+    "mov        $0x80808080,%%eax              \n"
+    "vmovd      %%eax,%%xmm6                   \n"
+    "vbroadcastss %%xmm6,%%ymm6                \n"
+    "mov        $0x807f807f,%%eax              \n"
+    "vmovd      %%eax,%%xmm7                   \n"
+    "vbroadcastss %%xmm7,%%ymm7                \n"
+    "sub        %2,%0                          \n"
+    "sub        %2,%1                          \n"
+    "sub        %2,%3                          \n"
+
+    // 32 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    (%2),%%ymm0                    \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vmovdqu    (%0,%2,1),%%ymm1               \n"
+    "vmovdqu    (%1,%2,1),%%ymm2               \n"
+    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
+    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
+    "lea        0x20(%2),%2                    \n"
+    "sub        $0x20,%4                       \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src0),       // %0
+    "+r"(src1),       // %1
+    "+r"(alpha),      // %2
+    "+r"(dst),        // %3
+    "+rm"(width)      // %4
+  :: "memory", "cc", "eax",
+     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+static uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
+};
+static uvec8 kShuffleAlpha1 = {
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
+};
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "pslld     $0x18,%%xmm3                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpcklbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm1,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "punpckhbw %%xmm2,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pand      %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha0),  // %3
+    "m"(kShuffleAlpha1)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
+    "sub        %0,%1                          \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
+    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha_AVX2)  // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  uintptr_t alpha;
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),     // %0
+    "+r"(dst_argb),     // %1
+    "+r"(width),        // %2
+    "=&r"(alpha)        // %3
+  : "r"(fixed_invtbl8)  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  uintptr_t alpha;
+  asm volatile (
+    "sub        %0,%1                          \n"
+    "vbroadcastf128 %5,%%ymm5                  \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // replace VPGATHER
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+    // end of VPGATHER
+
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb),      // %1
+    "+r"(width),         // %2
+    "=&r"(alpha)         // %3
+  : "r"(fixed_invtbl8),  // %4
+    "m"(kUnattenShuffleAlpha_AVX2)  // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrld     $0x18,%%xmm2                    \n"
+    "psrld     $0x18,%%xmm3                    \n"
+    "packuswb  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm3                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"
+    "punpckhwd %%xmm3,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64)     // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  asm volatile (
+    "movdqa    %2,%%xmm2                       \n"
+    "movdqa    %3,%%xmm3                       \n"
+    "movdqa    %4,%%xmm4                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm6                   \n"
+    "phaddw    %%xmm6,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm5                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm5                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "psrld     $0x18,%%xmm6                    \n"
+    "psrld     $0x18,%%xmm1                    \n"
+    "packuswb  %%xmm1,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm5                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "punpckhwd %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%1                         \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),      // %0
+    "+r"(width)          // %1
+  : "m"(kARGBToSepiaB),  // %2
+    "m"(kARGBToSepiaG),  // %3
+    "m"(kARGBToSepiaR)   // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
+    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
+    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
+    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
+    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm7                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm7,%%xmm0                   \n"
+    "phaddsw   %%xmm1,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm0                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm1                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "punpcklwd %%xmm1,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm6                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb),      // %1
+    "+r"(width)          // %2
+  : "r"(matrix_argb)     // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "movd      %2,%%xmm2                       \n"
+    "movd      %3,%%xmm3                       \n"
+    "movd      %4,%%xmm4                       \n"
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "pslld     $0x18,%%xmm6                    \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "pmullw    %%xmm3,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm6,%%xmm7                   \n"
+    "paddw     %%xmm4,%%xmm0                   \n"
+    "paddw     %%xmm4,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm7,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x4,%1                         \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "movd      %3,%%xmm2                       \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm2                  \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                  \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpckhbw %%xmm5,%%xmm3                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__AVX2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psubusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    "sub       %0,%3                           \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
+    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%4                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
+    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
+    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm2                   \n"
+    "punpckhbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm1                   \n"
+    "punpckhwd %%xmm2,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklwd %%xmm0,%%xmm3                   \n"
+    "punpckhwd %%xmm0,%%xmm0                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "paddusb   %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "punpckhbw %%xmm5,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "punpcklbw %%xmm2,%%xmm4                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "punpcklwd %%xmm3,%%xmm6                   \n"
+    "punpckhwd %%xmm3,%%xmm4                   \n"
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "punpcklwd %%xmm0,%%xmm7                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+    "test      $0xf,%1                         \n"
+    "jne       49f                             \n"
+
+  // 4 pixel loop                              \n"
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"
+    "punpckhwd %%xmm1,%%xmm3                   \n"
+    "punpckhbw %%xmm1,%%xmm4                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "punpcklwd %%xmm1,%%xmm4                   \n"
+    "punpckhwd %%xmm1,%%xmm5                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
+    "paddd     %%xmm0,%%xmm3                   \n"
+    "paddd     %%xmm4,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
+    "paddd     %%xmm0,%%xmm4                   \n"
+    "paddd     %%xmm5,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm5                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop                              \n"
+    LABELALIGN
+  "10:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+  : "+r"(row),  // %0
+    "+r"(cumsum),  // %1
+    "+r"(previous_cumsum),  // %2
+    "+r"(width)  // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
+  asm volatile (
+    "movd      %5,%%xmm5                       \n"
+    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
+    "rcpss     %%xmm5,%%xmm4                   \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+    "cmpl      $0x80,%5                        \n"
+    "ja        40f                             \n"
+
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrld     $0x10,%%xmm6                    \n"
+    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
+    "addps     %%xmm6,%%xmm5                   \n"
+    "mulps     %%xmm4,%%xmm5                   \n"
+    "cvtps2dq  %%xmm5,%%xmm5                   \n"
+    "packssdw  %%xmm5,%%xmm5                   \n"
+
+  // 4 pixel small loop                        \n"
+    LABELALIGN
+  "4:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm0                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       4b                              \n"
+    "jmp       49f                             \n"
+
+  // 4 pixel loop                              \n"
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm1                   \n"
+    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+    "mulps     %%xmm4,%%xmm2                   \n"
+    "mulps     %%xmm4,%%xmm3                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "cvtps2dq  %%xmm1,%%xmm1                   \n"
+    "cvtps2dq  %%xmm2,%%xmm2                   \n"
+    "cvtps2dq  %%xmm3,%%xmm3                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop                              \n"
+    LABELALIGN
+  "10:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(topleft),  // %0
+    "+r"(botleft),  // %1
+    "+r"(dst),      // %2
+    "+rm"(count)    // %3
+  : "r"((intptr_t)(width)),  // %4
+    "rm"(area)     // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* src_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp;
+  asm volatile (
+    "movq      " MEMACCESS(3) ",%%xmm2         \n"
+    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
+    "shl       $0x10,%1                        \n"
+    "add       $0x4,%1                         \n"
+    "movd      %1,%%xmm5                       \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"
+
+    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm0                   \n"
+    "movlhps   %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm7,%%xmm4                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
+
+  // 4 pixel loop                              \n"
+    LABELALIGN
+  "40:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
+    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
+    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm1                   \n"
+    "addps     %%xmm4,%%xmm2                   \n"
+    "movq      %%xmm1," MEMACCESS(2) "         \n"
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm0                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%4                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop                              \n"
+    LABELALIGN
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm2                   \n"
+    "movd      %%xmm0,%k1                      \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x04,2) ",%2           \n"
+    "sub       $0x1,%4                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1
+    "+r"(dst_argb),  // %2
+    "+r"(src_dudv),  // %3
+    "+rm"(width),    // %4
+    "=&r"(temp)      // %5
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "cmp       $0x80,%3                        \n"
+    "je        50f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x100,%3                       \n"
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "mov       $0x80808080,%%eax               \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "punpcklbw  %%xmm2,%%xmm0                  \n"
+    "punpckhbw  %%xmm2,%%xmm1                  \n"
+    "psubb      %%xmm4,%%xmm0                  \n"
+    "psubb      %%xmm4,%%xmm1                  \n"
+    "movdqa     %%xmm5,%%xmm2                  \n"
+    "movdqa     %%xmm5,%%xmm3                  \n"
+    "pmaddubsw  %%xmm0,%%xmm2                  \n"
+    "pmaddubsw  %%xmm1,%%xmm3                  \n"
+    "paddw      %%xmm4,%%xmm2                  \n"
+    "paddw      %%xmm4,%%xmm3                  \n"
+    "psrlw      $0x8,%%xmm2                    \n"
+    "psrlw      $0x8,%%xmm3                    \n"
+    "packuswb   %%xmm3,%%xmm2                  \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+rm"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  asm volatile (
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "sub       %1,%0                           \n"
+    "cmp       $0x80,%3                        \n"
+    "je        50f                             \n"
+
+    "vmovd      %3,%%xmm0                      \n"
+    "neg        %3                             \n"
+    "add        $0x100,%3                      \n"
+    "vmovd      %3,%%xmm5                      \n"
+    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
+    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+    "vbroadcastss %%xmm5,%%ymm5                \n"
+    "mov        $0x80808080,%%eax              \n"
+    "vmovd      %%eax,%%xmm4                   \n"
+    "vbroadcastss %%xmm4,%%ymm4                \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
+    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
+    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
+    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "rep movsb " MEMMOVESTRING(1,0) "          \n"
+    "jmp       999f                            \n"
+
+  "99:                                         \n"
+    "vzeroupper                                \n"
+  "999:                                        \n"
+  : "+D"(dst_ptr),    // %0
+    "+S"(src_ptr),    // %1
+    "+cm"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width) {
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  uintptr_t pixel_temp;
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "mov       " MEMACCESS(4) ",%k2            \n"
+    "cmp       $0x3000102,%k2                  \n"
+    "je        3012f                           \n"
+    "cmp       $0x10203,%k2                    \n"
+    "je        123f                            \n"
+    "cmp       $0x30201,%k2                    \n"
+    "je        321f                            \n"
+    "cmp       $0x2010003,%k2                  \n"
+    "je        2103f                           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(4) ",%2             \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS(1) "            \n"
+    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
+    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
+    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "sub       $0x1,%3                         \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "123:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        123b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "321:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        321b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "2103:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        2103b                           \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "3012:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        3012b                           \n"
+
+  "99:                                         \n"
+  : "+r"(src_argb),     // %0
+    "+r"(dst_argb),     // %1
+    "=&d"(pixel_temp),  // %2
+    "+r"(width)         // %3
+  : "r"(shuffler)       // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+ asm volatile (
+    "sub       %1,%2                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm1                     \n"
+    "punpcklbw %%xmm2,%%xmm0                     \n"
+    "punpckhbw %%xmm2,%%xmm1                     \n"
+    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+ asm volatile (
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    "movdqa    %%xmm2,%%xmm1                     \n"
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "punpcklbw %%xmm0,%%xmm1                     \n"
+    "punpckhbw %%xmm0,%%xmm2                     \n"
+    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  asm volatile (
+    "pxor      %%xmm3,%%xmm3                   \n"
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm3,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm4                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"
+    "punpckhwd %%xmm3,%%xmm4                   \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
+    "addps     " MEMACCESS(3) ",%%xmm0         \n"
+    "addps     " MEMACCESS(3) ",%%xmm4         \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm1,%%xmm2                   \n"
+    "mulps     %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm2,%%xmm1                   \n"
+    "mulps     %%xmm6,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
+    "addps     %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm6,%%xmm4                   \n"
+    "addps     %%xmm1,%%xmm0                   \n"
+    "addps     %%xmm5,%%xmm4                   \n"
+    "cvttps2dq %%xmm0,%%xmm0                   \n"
+    "cvttps2dq %%xmm4,%%xmm4                   \n"
+    "packuswb  %%xmm4,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x2,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
+    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
+    "lea         " MEMLEA(0x8,0) ",%0          \n"
+    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
+    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
+    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
+    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
+    "lea         " MEMLEA(0x8,1) ",%1          \n"
+    "sub         $0x2,%2                       \n"
+    "jg          1b                            \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  uintptr_t pixel_temp;
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
+    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),     // %0
+    "=&d"(pixel_temp),  // %1
+    "+r"(width)         // %2
+  : "r"(table_argb)     // %3
+  : "memory", "cc");
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  uintptr_t pixel_temp;
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),     // %0
+    "=&d"(pixel_temp),  // %1
+    "+r"(width)         // %2
+  : "r"(table_argb)     // %3
+  : "memory", "cc");
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  uintptr_t pixel_temp;
+  uintptr_t table_temp;
+  asm volatile (
+    "movd      %6,%%xmm3                       \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0x8,%%xmm4                     \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "phaddw    %%xmm0,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS(2) ",%0             \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS(3) "            \n"
+    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
+    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
+    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
+    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
+    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
+    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
+    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
+    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
+    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+
+    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
+    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
+    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
+    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "lea       " MEMLEA(0x10,3) ",%3           \n"
+    "sub       $0x4,%4                         \n"
+    "jg        1b                              \n"
+  : "=&d"(pixel_temp),  // %0
+    "=&a"(table_temp),  // %1
+    "+r"(src_argb),     // %2
+    "+r"(dst_argb),     // %3
+    "+rm"(width)        // %4
+  : "r"(luma),          // %5
+    "rm"(lumacoeff)     // %6
+  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_mips.cc b/files/source/row_mips.cc
new file mode 100644
index 00000000..285f0b5a
--- /dev/null
+++ b/files/source/row_mips.cc
@@ -0,0 +1,782 @@
+/*
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+  __asm__ __volatile__ (
+    ".set      noreorder                         \n"
+    ".set      noat                              \n"
+    "slti      $at, %[count], 8                  \n"
+    "bne       $at ,$zero, $last8                \n"
+    "xor       $t8, %[src], %[dst]               \n"
+    "andi      $t8, $t8, 0x3                     \n"
+
+    "bne       $t8, $zero, unaligned             \n"
+    "negu      $a3, %[dst]                       \n"
+    // make dst/src aligned
+    "andi      $a3, $a3, 0x3                     \n"
+    "beq       $a3, $zero, $chk16w               \n"
+    // word-aligned now count is the remining bytes count
+    "subu     %[count], %[count], $a3            \n"
+
+    "lwr       $t8, 0(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"
+    "swr       $t8, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+
+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w:                                    \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, chk8w              \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"
+    // t0 is the "past the end" address
+
+    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+    // the "t0-32" address
+    // This means: for x=128 the last "safe" a1 address is "t0-160"
+    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+    // we will use "pref 30,128(a1)", so "t0-160" is the limit
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line of src
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $loop16w                     \n"
+    "nop                                         \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$loop16w:                                    \n"
+    "pref      0, 96(%[src])                     \n"
+    "lw        $t0, 0(%[src])                    \n"
+    "bgtz      $v1, $skip_pref30_96              \n"  // skip
+    "lw        $t1, 4(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"  // continue
+    "$skip_pref30_96:                            \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    //  bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lw        $t0, 32(%[src])                   \n"
+    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
+    "lw        $t1, 36(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+    "$skip_pref30_128:                           \n"
+    "lw        $t2, 40(%[src])                   \n"
+    "lw        $t3, 44(%[src])                   \n"
+    "lw        $t4, 48(%[src])                   \n"
+    "lw        $t5, 52(%[src])                   \n"
+    "lw        $t6, 56(%[src])                   \n"
+    "lw        $t7, 60(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bne       %[dst], $a3, $loop16w             \n"
+    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+    "move      %[count], $t8                     \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "chk8w:                                      \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // the t8 is the reminder count past 32-bytes
+    "beq       %[count], $t8, chk1w              \n"
+    // count=t8,no 32-byte chunk
+    " nop                                        \n"
+
+    "lw        $t0, 0(%[src])                    \n"
+    "lw        $t1, 4(%[src])                    \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "chk1w:                                      \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the reminder past 1w chunks
+    "beq       %[count], $t8, $last8             \n"
+    " subu     $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop:                             \n"
+    "lw        $t3, 0(%[src])                    \n"
+    // the first t3 may be equal t0 ... optimize?
+    "addiu     %[src], %[src],4                  \n"
+    "addiu     %[dst], %[dst],4                  \n"
+    "bne       %[dst], $a3,$wordCopy_loop        \n"
+    " sw       $t3, -4(%[dst])                   \n"
+
+    // For the last (<8) bytes
+    "$last8:                                     \n"
+    "blez      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+    "$last8loop:                                 \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst], $a3, $last8loop           \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "leave:                                      \n"
+    "  j       $ra                               \n"
+    "  nop                                       \n"
+
+    //
+    // UNALIGNED case
+    //
+
+    "unaligned:                                  \n"
+    // got here with a3="negu a1"
+    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
+    "beqz      $a3, $ua_chk16w                   \n"
+    " subu     %[count], %[count], $a3           \n"
+    // bytes left after initial a3 bytes
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+    "swr       $v1, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w:                                 \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, ua_chk8w           \n"
+    // if a2==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line  addr 32
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $ua_loop16w                  \n"
+    // skip "pref 30,64(a1)" for too short arrays
+    " nop                                        \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$ua_loop16w:                                \n"
+    "pref      0, 96(%[src])                     \n"
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "bgtz      $v1, $ua_skip_pref30_96           \n"
+    " lwl      $t1, 7(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96:                         \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    // bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lwr       $t0, 32(%[src])                   \n"
+    "lwl       $t0, 35(%[src])                   \n"
+    "lwr       $t1, 36(%[src])                   \n"
+    "bgtz      $v1, ua_skip_pref30_128           \n"
+    " lwl      $t1, 39(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"
+    // continue setting up the dest, addr 128
+    "ua_skip_pref30_128:                         \n"
+
+    "lwr       $t2, 40(%[src])                   \n"
+    "lwl       $t2, 43(%[src])                   \n"
+    "lwr       $t3, 44(%[src])                   \n"
+    "lwl       $t3, 47(%[src])                   \n"
+    "lwr       $t4, 48(%[src])                   \n"
+    "lwl       $t4, 51(%[src])                   \n"
+    "lwr       $t5, 52(%[src])                   \n"
+    "lwl       $t5, 55(%[src])                   \n"
+    "lwr       $t6, 56(%[src])                   \n"
+    "lwl       $t6, 59(%[src])                   \n"
+    "lwr       $t7, 60(%[src])                   \n"
+    "lwl       $t7, 63(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+    "sgtu      $v1,%[dst],$t9                    \n"
+    "bne       %[dst],$a3,$ua_loop16w            \n"
+    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+    "move      %[count],$t8                      \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "ua_chk8w:                                   \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // the t8 is the reminder count
+    "beq       %[count], $t8, $ua_chk1w          \n"
+    // when count==t8, no 32-byte chunk
+
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "lwl       $t1, 7(%[src])                    \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "$ua_chk1w:                                  \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the reminder past 1w chunks
+    "beq       %[count], $t8, ua_smallCopy       \n"
+    "subu      $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+
+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop:                          \n"
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addiu     %[src], %[src], 4                 \n"
+    "addiu     %[dst], %[dst], 4                 \n"
+    // note: dst=a1 is word aligned here, see NOTE1
+    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+    " sw       $v1,-4(%[dst])                    \n"
+
+    // Now less than 4 bytes (value in count) left to copy
+    "ua_smallCopy:                               \n"
+    "beqz      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
+    "$ua_smallCopy_loop:                         \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "j         $ra                               \n"
+    " nop                                        \n"
+    ".set      at                                \n"
+    ".set      reorder                           \n"
+       : [dst] "+r" (dst), [src] "+r" (src)
+       : [count] "r" (count)
+       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+       "t8", "t9", "a3", "v1", "at"
+  );
+}
+#endif  // HAS_COPYROW_MIPS
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiplies of 16
+    "blez            $t4, 2f                       \n"
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+  "1:                                              \n"
+    "addiu           $t4, $t4, -1                  \n"
+    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"
+
+    "beqz            %[width], 3f                  \n"
+    " nop                                          \n"
+
+  "2:                                              \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",
+     "t4", "t5", "t6", "t7", "t8", "t9"
+  );
+}
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
+  __asm__ __volatile__ (
+    ".set push                             \n"
+    ".set noreorder                        \n"
+
+    "srl       $t4, %[width], 4            \n"  // multiplies of 16
+    "andi      $t5, %[width], 0xf          \n"
+    "blez      $t4, 2f                     \n"
+    " addu     %[src], %[src], %[width]    \n"  // src += width
+
+   "1:                                     \n"
+    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
+    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
+    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
+    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
+    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
+    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
+    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
+    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
+    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
+    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
+    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
+    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
+    "addiu     %[src], %[src], -16         \n"
+    "addiu     $t4, $t4, -1                \n"
+    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
+    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
+    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
+    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
+    "bgtz      $t4, 1b                     \n"
+    " addiu    %[dst], %[dst], 16          \n"
+    "beqz      $t5, 3f                     \n"
+    " nop                                  \n"
+
+   "2:                                     \n"
+    "lbu       $t0, -1(%[src])             \n"
+    "addiu     $t5, $t5, -1                \n"
+    "addiu     %[src], %[src], -1          \n"
+    "sb        $t0, 0(%[dst])              \n"
+    "bgez      $t5, 2b                     \n"
+    " addiu    %[dst], %[dst], 1           \n"
+
+   "3:                                     \n"
+    ".set pop                              \n"
+      : [src] "+r" (src), [dst] "+r" (dst)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4", "t5"
+  );
+}
+
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  int x;
+  int y;
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "addu            $t4, %[width], %[width]      \n"
+    "srl             %[x], %[width], 4            \n"
+    "andi            %[y], %[width], 0xf          \n"
+    "blez            %[x], 2f                     \n"
+    " addu           %[src_uv], %[src_uv], $t4    \n"
+
+   "1:                                            \n"
+    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
+    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
+    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
+    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
+    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
+    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
+    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
+    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
+
+    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
+    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
+    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
+    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
+    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
+    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
+    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
+    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
+    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
+    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
+    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
+    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
+    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
+    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
+    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
+    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
+    "addiu           %[src_uv], %[src_uv], -32    \n"
+    "addiu           %[x], %[x], -1               \n"
+    "swr             $t4, 0(%[dst_u])             \n"
+    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
+    "swr             $t6, 0(%[dst_v])             \n"
+    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
+    "swr             $t2, 4(%[dst_u])             \n"
+    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
+    "swr             $t3, 4(%[dst_v])             \n"
+    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
+    "swr             $t0, 8(%[dst_u])             \n"
+    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
+    "swr             $t1, 8(%[dst_v])             \n"
+    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
+    "swr             $t9, 12(%[dst_u])            \n"
+    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
+    "swr             $t5, 12(%[dst_v])            \n"
+    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
+    "addiu           %[dst_v], %[dst_v], 16       \n"
+    "bgtz            %[x], 1b                     \n"
+    " addiu          %[dst_u], %[dst_u], 16       \n"
+    "beqz            %[y], 3f                     \n"
+    " nop                                         \n"
+    "b               2f                           \n"
+    " nop                                         \n"
+
+   "2:                                            \n"
+    "lbu             $t0, -2(%[src_uv])           \n"
+    "lbu             $t1, -1(%[src_uv])           \n"
+    "addiu           %[src_uv], %[src_uv], -2     \n"
+    "addiu           %[y], %[y], -1               \n"
+    "sb              $t0, 0(%[dst_u])             \n"
+    "sb              $t1, 0(%[dst_v])             \n"
+    "addiu           %[dst_u], %[dst_u], 1        \n"
+    "bgtz            %[y], 2b                     \n"
+    " addiu          %[dst_v], %[dst_v], 1        \n"
+
+   "3:                                            \n"
+    ".set pop                                     \n"
+      : [src_uv] "+r" (src_uv),
+        [dst_u] "+r" (dst_u),
+        [dst_v] "+r" (dst_v),
+        [x] "=&r" (x),
+        [y] "=&r" (y)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t7", "t8", "t9"
+  );
+}
+
+// Convert (4 Y and 2 VU) I422 and arrange RGB values into
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define YUVTORGB                                                               \
+      "lw                $t0, 0(%[y_buf])       \n"                            \
+      "lhu               $t1, 0(%[u_buf])       \n"                            \
+      "lhu               $t2, 0(%[v_buf])       \n"                            \
+      "preceu.ph.qbr     $t1, $t1               \n"                            \
+      "preceu.ph.qbr     $t2, $t2               \n"                            \
+      "preceu.ph.qbra    $t3, $t0               \n"                            \
+      "preceu.ph.qbla    $t0, $t0               \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t3, $t3, $s4          \n"                            \
+      "subu.ph           $t0, $t0, $s4          \n"                            \
+      "mul.ph            $t3, $t3, $s0          \n"                            \
+      "mul.ph            $t0, $t0, $s0          \n"                            \
+      "shll.ph           $t4, $t1, 0x7          \n"                            \
+      "subu.ph           $t4, $t4, $t1          \n"                            \
+      "mul.ph            $t6, $t1, $s1          \n"                            \
+      "mul.ph            $t1, $t2, $s2          \n"                            \
+      "addq_s.ph         $t5, $t4, $t3          \n"                            \
+      "addq_s.ph         $t4, $t4, $t0          \n"                            \
+      "shra.ph           $t5, $t5, 6            \n"                            \
+      "shra.ph           $t4, $t4, 6            \n"                            \
+      "addiu             %[u_buf], 2            \n"                            \
+      "addiu             %[v_buf], 2            \n"                            \
+      "addu.ph           $t6, $t6, $t1          \n"                            \
+      "mul.ph            $t1, $t2, $s3          \n"                            \
+      "addu.ph           $t9, $t6, $t3          \n"                            \
+      "addu.ph           $t8, $t6, $t0          \n"                            \
+      "shra.ph           $t9, $t9, 6            \n"                            \
+      "shra.ph           $t8, $t8, 6            \n"                            \
+      "addu.ph           $t2, $t1, $t3          \n"                            \
+      "addu.ph           $t1, $t1, $t0          \n"                            \
+      "shra.ph           $t2, $t2, 6            \n"                            \
+      "shra.ph           $t1, $t1, 6            \n"                            \
+      "subu.ph           $t5, $t5, $s5          \n"                            \
+      "subu.ph           $t4, $t4, $s5          \n"                            \
+      "subu.ph           $t9, $t9, $s5          \n"                            \
+      "subu.ph           $t8, $t8, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "shll_s.ph         $t5, $t5, 8            \n"                            \
+      "shll_s.ph         $t4, $t4, 8            \n"                            \
+      "shll_s.ph         $t9, $t9, 8            \n"                            \
+      "shll_s.ph         $t8, $t8, 8            \n"                            \
+      "shll_s.ph         $t2, $t2, 8            \n"                            \
+      "shll_s.ph         $t1, $t1, 8            \n"                            \
+      "shra.ph           $t5, $t5, 8            \n"                            \
+      "shra.ph           $t4, $t4, 8            \n"                            \
+      "shra.ph           $t9, $t9, 8            \n"                            \
+      "shra.ph           $t8, $t8, 8            \n"                            \
+      "shra.ph           $t2, $t2, 8            \n"                            \
+      "shra.ph           $t1, $t1, 8            \n"                            \
+      "addu.ph           $t5, $t5, $s5          \n"                            \
+      "addu.ph           $t4, $t4, $s5          \n"                            \
+      "addu.ph           $t9, $t9, $s5          \n"                            \
+      "addu.ph           $t8, $t8, $s5          \n"                            \
+      "addu.ph           $t2, $t2, $s5          \n"                            \
+      "addu.ph           $t1, $t1, $s5          \n"
+
+// TODO(fbarchard): accept yuv conversion constants.
+void I422ToARGBRow_DSPR2(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"
+    "beqz              %[width], 2f           \n"
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
+
+   "1:                                        \n"
+      YUVTORGB
+// Arranging into argb format
+    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
+    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
+    "addiu             %[width], -4           \n"
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+
+    "addiu             %[y_buf], 4            \n"
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
+    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"
+    " addiu            %[rgb_buf], 16         \n"
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+    int y0_fraction = 256 - source_y_fraction;
+    const uint8* src_ptr1 = src_ptr + src_stride;
+
+  __asm__ __volatile__ (
+     ".set push                                           \n"
+     ".set noreorder                                      \n"
+
+     "replv.ph          $t0, %[y0_fraction]               \n"
+     "replv.ph          $t1, %[source_y_fraction]         \n"
+
+   "1:                                                    \n"
+     "lw                $t2, 0(%[src_ptr])                \n"
+     "lw                $t3, 0(%[src_ptr1])               \n"
+     "lw                $t4, 4(%[src_ptr])                \n"
+     "lw                $t5, 4(%[src_ptr1])               \n"
+     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
+     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
+     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
+     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
+     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
+     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
+     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
+     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
+     "addq.ph           $t6, $t6, $t8                     \n"
+     "addq.ph           $t7, $t7, $t9                     \n"
+     "addq.ph           $t2, $t2, $t4                     \n"
+     "addq.ph           $t3, $t3, $t5                     \n"
+     "shra.ph           $t6, $t6, 8                       \n"
+     "shra.ph           $t7, $t7, 8                       \n"
+     "shra.ph           $t2, $t2, 8                       \n"
+     "shra.ph           $t3, $t3, 8                       \n"
+     "precr.qb.ph       $t6, $t6, $t7                     \n"
+     "precr.qb.ph       $t2, $t2, $t3                     \n"
+     "addiu             %[src_ptr], %[src_ptr], 8         \n"
+     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
+     "addiu             %[dst_width], %[dst_width], -8    \n"
+     "sw                $t6, 0(%[dst_ptr])                \n"
+     "sw                $t2, 4(%[dst_ptr])                \n"
+     "bgtz              %[dst_width], 1b                  \n"
+     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
+
+     ".set pop                                            \n"
+  : [dst_ptr] "+r" (dst_ptr),
+    [src_ptr1] "+r" (src_ptr1),
+    [src_ptr] "+r" (src_ptr),
+    [dst_width] "+r" (dst_width)
+  : [source_y_fraction] "r" (source_y_fraction),
+    [y0_fraction] "r" (y0_fraction),
+    [src_stride] "r" (src_stride)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+#endif  // __mips_dsp_rev >= 2
+
+#endif  // defined(__mips__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index 19a78330..909df060 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -16,812 +16,2826 @@ extern "C" {
 #endif
 
 // This module is for GCC Neon
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
 #define READYUV422                                                             \
-    "vld1.u8    {d0}, [%0]!                    \n"                             \
-    "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
-    "vld1.u32   {d2[1]}, [%2]!                 \n"
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.32    {d2[1]}, [%2]!                 \n"
+
+// Read 8 Y, 2 U and 2 V from 422
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vzip.u8    d2, d3                         \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.8     {d3}, [%2]!                    \n"                             \
+    "vpaddl.u8  q1, q1                         \n"                             \
+    "vrshrn.u16 d2, q1, #1                     \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
-    "vld1.u8    {d0}, [%0]!                    \n"                             \
-    "vld1.u8    {d2}, [%1]!                    \n"                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
-    "vld1.u8    {d0}, [%0]!                    \n"                             \
-    "vld1.u8    {d2}, [%1]!                    \n"                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"                             \
-
-#define YUV422TORGB                                                            \
-    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
-    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
-    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
-    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
-    "vtrn.u8    d0, d1                         \n"                             \
-    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
-    "vmul.s16   q0, q0, q14                    \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 YUY2
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d0, d2}, [%0]!                \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 UYVY
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d2, d3}, [%0]!                \n"                             \
+    "vmov.u8    d0, d3                         \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+#define YUVTORGB_SETUP                                                         \
+    MEMACCESS([kUVToRB])                                                       \
+    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
+    MEMACCESS([kUVToG])                                                        \
+    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
+    MEMACCESS([kYToRgb])                                                       \
+    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
+
+#define YUVTORGB                                                               \
+    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
+    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
+    "vmovl.u8   q0, d0                         \n" /* Y                      */\
+    "vmovl.s16  q10, d1                        \n"                             \
+    "vmovl.s16  q0, d0                         \n"                             \
+    "vmul.s32   q10, q10, q15                  \n"                             \
+    "vmul.s32   q0, q0, q15                    \n"                             \
+    "vqshrun.s32 d0, q0, #16                   \n"                             \
+    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
     "vadd.s16   d18, d19                       \n"                             \
-    "vqadd.s16  d20, d0, d16                   \n"                             \
-    "vqadd.s16  d21, d1, d16                   \n"                             \
-    "vqadd.s16  d22, d0, d17                   \n"                             \
-    "vqadd.s16  d23, d1, d17                   \n"                             \
-    "vqadd.s16  d16, d0, d18                   \n"                             \
-    "vqadd.s16  d17, d1, d18                   \n"                             \
-    "vqrshrun.s16 d0, q10, #6                  \n"                             \
-    "vqrshrun.s16 d1, q11, #6                  \n"                             \
-    "vqrshrun.s16 d2, q8, #6                   \n"                             \
-    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
-    "vmovl.u8   q11, d1                        \n"                             \
-    "vmovl.u8   q8, d2                         \n"                             \
-    "vtrn.u8    d20, d21                       \n"                             \
-    "vtrn.u8    d22, d23                       \n"                             \
-    "vtrn.u8    d16, d17                       \n"                             \
-    "vmov.u8    d21, d16                       \n"
-
-#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) ||      \
-    defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON)
-static const vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
-                               0, 0, 0, 0, 0, 0, 0, 0 };
-static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
-#endif
+    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
+    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
+    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
+    "vaddw.u16  q1, q1, d16                    \n"                             \
+    "vaddw.u16  q10, q10, d17                  \n"                             \
+    "vaddw.u16  q3, q3, d18                    \n"                             \
+    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
+    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
+    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
+    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
+    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
+    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
+    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
+    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
+    "vqshrun.s16 d21, q0, #6                   \n" /* G */
 
-#ifdef HAS_I422TOARGBROW_NEON
-void I422ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
-    READYUV422
-    YUV422TORGB
+    READYUV444
+    YUVTORGB
     "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
-      "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TOARGBROW_NEON
 
-#ifdef HAS_I422TOBGRAROW_NEON
-void I422ToBGRARow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
+    YUVTORGB
     "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vmov.u8    d19, #255                      \n"
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
-      "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TOBGRAROW_NEON
 
-#ifdef HAS_I422TOABGRROW_NEON
-void I422ToABGRRow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) {
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             const uint8* src_a,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
+    YUVTORGB
+    "subs       %5, %5, #8                     \n"
+    MEMACCESS(3)
+    "vld1.8     {d23}, [%3]!                   \n"
+    MEMACCESS(4)
+    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(src_a),     // %3
+      "+r"(dst_argb),  // %4
+      "+r"(width)      // %5
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
     "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB
+    "subs       %4, %4, #8                     \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
-      "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TOABGRROW_NEON
 
-#ifdef HAS_I422TORGBAROW_NEON
-void I422ToRGBARow_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
+    YUVTORGB
     "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"
+    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
+    MEMACCESS(3)
     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
-      "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TORGBAROW_NEON
 
-#ifdef HAS_I422TORGB24ROW_NEON
-void I422ToRGB24Row_NEON(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) {
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
+    YUVTORGB
     "subs       %4, %4, #8                     \n"
+    MEMACCESS(3)
     "vst3.8     {d20, d21, d22}, [%3]!         \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
+    : "+r"(src_y),      // %0
+      "+r"(src_u),      // %1
+      "+r"(src_v),      // %2
+      "+r"(dst_rgb24),  // %3
+      "+r"(width)       // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTORGB565                                                           \
+    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
+    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
+    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
+    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
+    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "subs       %4, %4, #8                     \n"
+    ARGBTORGB565
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TORGB24ROW_NEON
 
-#ifdef HAS_I422TORAWROW_NEON
-void I422ToRAWRow_NEON(const uint8* y_buf,
-                       const uint8* u_buf,
-                       const uint8* v_buf,
-                       uint8* rgb_buf,
-                       int width) {
+#define ARGBTOARGB1555                                                         \
+    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
+    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
+    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
+    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
+    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
+    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
+    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%5]                    \n"
-    "vld1.u8    {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
+    YUVTORGB
     "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB1555
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(u_buf),    // %1
-      "+r"(v_buf),    // %2
-      "+r"(rgb_buf),  // %3
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_I422TORAWROW_NEON
 
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV12ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
+#define ARGBTOARGB4444                                                         \
+    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
+    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
+    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
+    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
+    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
+    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
+    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    READYUV400
+    YUVTORGB
+    "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d20}, [%0]!                   \n"
+    "vmov       d21, d20                       \n"
+    "vmov       d22, d20                       \n"
+    "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"
+  );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%4]                    \n"
-    "vld1.u8    {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
     READNV12
-    YUV422TORGB
+    YUVTORGB
     "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(uv_buf),   // %1
-      "+r"(rgb_buf),  // %2
-      "+r"(width)     // %3
-    : "r"(&kUVToRB),  // %4
-      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_NV12TOARGBROW_NEON
 
-#ifdef HAS_NV21TOARGBROW_NEON
-void NV21ToARGBRow_NEON(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    "vld1.u8    {d24}, [%4]                    \n"
-    "vld1.u8    {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
     READNV21
-    YUV422TORGB
+    YUVTORGB
     "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
-    : "+r"(y_buf),    // %0
-      "+r"(uv_buf),   // %1
-      "+r"(rgb_buf),  // %2
-      "+r"(width)     // %3
-    : "r"(&kUVToRB),  // %4
-      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
-#endif  // HAS_NV21TOARGBROW_NEON
 
-#ifdef HAS_SPLITUV_NEON
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
   asm volatile (
-    ".p2align  2                               \n"
+    YUVTORGB_SETUP
   "1:                                          \n"
-    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    READNV12
+    YUVTORGB
+    "subs       %3, %3, #8                     \n"
+    ARGBTORGB565
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    READYUY2
+    YUVTORGB
+    "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    READUYVY
+    YUVTORGB
+    "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vst1.u8    {q0}, [%1]!                    \n"  // store U
-    "vst1.u8    {q1}, [%2]!                    \n"  // Store V
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
+    "vst1.8     {q1}, [%2]!                    \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "memory", "cc", "q0", "q1"  // Clobber List
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // HAS_SPLITUV_NEON
 
-#ifdef HAS_COPYROW_NEON
-// Copy multiple of 64
+// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
-    "vldm       %0!, {q0, q1, q2, q3}          \n"  // load 64
-    "subs       %2, %2, #64                    \n"  // 64 processed per loop
-    "vstm       %1!, {q0, q1, q2, q3}          \n"  // store 64
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
     "bgt        1b                             \n"
-    : "+r"(src),   // %0
-      "+r"(dst),   // %1
-      "+r"(count)  // %2  // Output registers
-    :                     // Input registers
-    : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // HAS_COPYROW_NEON
 
-#ifdef HAS_SETROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (  // NOLINT
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-    "1:                                        \n"
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+  "1:                                          \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    "vst1.u32  {q0}, [%0]!                     \n"  // store
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
     "bgt       1b                              \n"
-    : "+r"(dst),   // %0
-      "+r"(count)  // %1
-    : "r"(v32)     // %2
-    : "q0", "memory", "cc");
-}
-
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void SetRows32_NEON(uint8* dst, uint32 v32, int width,
-                    int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    SetRow8_NEON(dst, v32, width << 2);
-    dst += dst_stride;
-  }
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "q0"
+  );
 }
-#endif  // HAS_SETROW_NEON
 
-#ifdef HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
-    // compute where to start writing destination
-    "add         %1, %2                        \n"
-    // work on segments that are multiples of 16
-    "lsrs        r3, %2, #4                    \n"
-    // the output is written in two block. 8 bytes followed
-    // by another 8. reading is done sequentially, from left to
-    // right. writing is done from right to left in block sizes
-    // %1, the destination pointer is incremented after writing
-    // the first of the two blocks. need to subtract that 8 off
-    // along with 16 to get the next location.
-    "mov         r3, #-24                      \n"
-    "beq         2f                            \n"
-
-    // back of destination by the size of the register that is
-    // going to be mirrored
-    "sub         %1, #16                       \n"
-    // the loop needs to run on blocks of 16. what will be left
-    // over is either a negative number, the residuals that need
-    // to be done, or 0. If this isn't subtracted off here the
-    // loop will run one extra time.
-    "sub         %2, #16                       \n"
-
-    // mirror the bytes in the 64 bit segments. unable to mirror
-    // the bytes in the entire 128 bits in one go.
-    // because of the inability to mirror the entire 128 bits
-    // mirror the writing out of the two 64 bit segments.
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld1.8      {q0}, [%0]!                   \n"  // src += 16
-    "subs        %2, #16                       \n"
-    "vrev64.8    q0, q0                        \n"
-    "vst1.8      {d1}, [%1]!                   \n"
-    "vst1.8      {d0}, [%1], r3                \n"  // dst -= 16
-    "bge         1b                            \n"
-
-    // add 16 back to the counter. if the result is 0 there is no
-    // residuals so jump past
-    "adds        %2, #16                       \n"
-    "beq         5f                            \n"
-    "add         %1, #16                       \n"
-  "2:                                          \n"
-    "mov         r3, #-3                       \n"
-    "sub         %1, #2                        \n"
-    "subs        %2, #2                        \n"
-    // check for 16*n+1 scenarios where segments_of_2 should not
-    // be run, but there is something left over.
-    "blt         4f                            \n"
-
-// do this in neon registers as per
-// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-  "3:                                          \n"
-    "vld2.8      {d0[0], d1[0]}, [%0]!         \n"  // src += 2
-    "subs        %2, #2                        \n"
-    "vst1.8      {d1[0]}, [%1]!                \n"
-    "vst1.8      {d0[0]}, [%1], r3             \n"  // dst -= 2
-    "bge         3b                            \n"
-
-    "adds        %2, #2                        \n"
-    "beq         5f                            \n"
-  "4:                                          \n"
-    "add         %1, #1                        \n"
-    "vld1.8      {d0[0]}, [%0]                 \n"
-    "vst1.8      {d0[0]}, [%1]                 \n"
-  "5:                                          \n"
-    : "+r"(src),   // %0
-      "+r"(dst),   // %1
-      "+r"(width)  // %2
-    :
-    : "memory", "cc", "r3", "q0"
-  );
-}
-#endif  // HAS_MIRRORROW_NEON
-
-#ifdef HAS_MIRRORROWUV_NEON
-void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
-  asm volatile (
-    // compute where to start writing destination
-    "add         %1, %3                        \n"  // dst_a + width
-    "add         %2, %3                        \n"  // dst_b + width
-    // work on input segments that are multiples of 16, but
-    // width that has been passed is output segments, half
-    // the size of input.
-    "lsrs        r12, %3, #3                   \n"
-    "beq         2f                            \n"
-    // the output is written in to two blocks.
-    "mov         r12, #-8                      \n"
-    // back of destination by the size of the register that is
-    // going to be mirrord
-    "sub         %1, #8                        \n"
-    "sub         %2, #8                        \n"
-    // the loop needs to run on blocks of 8. what will be left
-    // over is either a negative number, the residuals that need
-    // to be done, or 0. if this isn't subtracted off here the
-    // loop will run one extra time.
-    "sub         %3, #8                        \n"
-
-    // mirror the bytes in the 64 bit segments
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld2.8      {d0, d1}, [%0]!               \n"  // src += 16
-    "subs        %3, #8                        \n"
-    "vrev64.8    q0, q0                        \n"
-    "vst1.8      {d0}, [%1], r12               \n"  // dst_a -= 8
-    "vst1.8      {d1}, [%2], r12               \n"  // dst_b -= 8
-    "bge         1b                            \n"
-
-    // add 8 back to the counter. if the result is 0 there is no
-    // residuals so return
-    "adds        %3, #8                        \n"
-    "beq         4f                            \n"
-    "add         %1, #8                        \n"
-    "add         %2, #8                        \n"
-  "2:                                          \n"
-    "mov         r12, #-1                      \n"
-    "sub         %1, #1                        \n"
-    "sub         %2, #1                        \n"
-  "3:                                          \n"
-      "vld2.8      {d0[0], d1[0]}, [%0]!       \n"  // src += 2
-      "subs        %3, %3, #1                  \n"
-      "vst1.8      {d0[0]}, [%1], r12          \n"  // dst_a -= 1
-      "vst1.8      {d1[0]}, [%2], r12          \n"  // dst_b -= 1
-      "bgt         3b                          \n"
-  "4:                                          \n"
-    : "+r"(src),    // %0
-      "+r"(dst_a),  // %1
-      "+r"(dst_b),  // %2
-      "+r"(width)   // %3
-    :
-    : "memory", "cc", "r12", "q0"
+    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "q0"
   );
 }
-#endif  // HAS_MIRRORROWUV_NEON
 
-#ifdef HAS_BGRATOARGBROW_NEON
-void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
-    ".p2align  2                               \n"
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2                     \n"
+    "sub        %0, #16                        \n"
+
   "1:                                          \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d2                         \n"  // swap G, R
-    "vswp.u8    d0, d3                         \n"  // swap B, A
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #16                        \n"  // 16 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"
     "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "r3", "q0"
   );
 }
-#endif  // HAS_BGRATOARGBROW_NEON
 
-#ifdef HAS_ABGRTOARGBROW_NEON
-void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
   asm volatile (
-    ".p2align  2                               \n"
+    // Start at end of source row.
+    "mov        r12, #-16                      \n"
+    "add        %0, %0, %3, lsl #1             \n"
+    "sub        %0, #16                        \n"
+
   "1:                                          \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d0, d2                         \n"  // swap R, B
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    MEMACCESS(0)
+    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+    "subs       %3, #8                         \n"  // 8 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"
     "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "r12", "q0"
   );
 }
-#endif  // HAS_ABGRTOARGBROW_NEON
 
-#ifdef HAS_RGBATOARGBROW_NEON
-void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
-    ".p2align  2                               \n"
-  "1:                                           \n"
-    "vld1.8     {d0, d1, d2, d3}, [%0]!         \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                      \n"  // 8 processed per loop.
-    "vmov.u8    d4, d0                          \n"  // move A after RGB
-    "vst4.8     {d1, d2, d3, d4}, [%1]!         \n"  // store 8 pixels of ARGB.
-    "bgt        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2, lsl #2             \n"
+    "sub        %0, #16                        \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vrev64.32  q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "r3", "q0"
   );
 }
-#endif  // HAS_RGBATOARGBROW_NEON
 
-#ifdef HAS_RGB24TOARGBROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
   asm volatile (
     "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align  2                               \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),   // %1
-    "+r"(pix)         // %2
+    "+r"(width)         // %2
   :
-  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
   );
 }
-#endif  // HAS_RGB24TOARGBROW_NEON
 
-#ifdef HAS_RAWTOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
   asm volatile (
     "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align  2                               \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
+    "+r"(width)      // %2
   :
-  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
   );
 }
-#endif  // HAS_RAWTOARGBROW_NEON
 
-#ifdef HAS_ARGBTORGBAROW_NEON
-void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmov.u8    d0, d4                         \n"  // move A before RGB.
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgba),  // %1
-    "+r"(pix)        // %2
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
   );
 }
-#endif  // HAS_ARGBTORGBAROW_NEON
 
-#ifdef HAS_ARGBTORGB24ROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+#define RGB565TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
   asm volatile (
-    ".p2align  2                               \n"
+    "vmov.u8    d3, #255                       \n"  // Alpha
   "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB                                                         \
+    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
+    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
+    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
+    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
+    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
+    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB4444TOARGB                                                         \
+    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
+    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
+    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
+    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
+    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
+    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
+    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_rgb24),  // %1
-    "+r"(pix)         // %2
+    "+r"(width)         // %2
   :
-  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
   );
 }
-#endif  // HAS_ARGBTORGB24ROW_NEON
 
-#ifdef HAS_ARGBTORAWROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_raw),   // %1
-    "+r"(pix)        // %2
+    "+r"(width)        // %2
   :
-  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
   );
 }
-#endif  // HAS_ARGBTORAWROW_NEON
 
-#ifdef HAS_YUY2TOYROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
-    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vst1.u8    {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
+    "+r"(width)        // %2
   :
-  : "memory", "cc", "q0", "q1"  // Clobber List
+  : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // HAS_YUY2TOYROW_NEON
 
-#ifdef HAS_UYVYTOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
-    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vst1.u8    {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
+    "+r"(width)        // %2
   :
-  : "memory", "cc", "q0", "q1"  // Clobber List
+  : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // HAS_UYVYTOYROW_NEON
 
-#ifdef HAS_YUY2TOYROW_NEON
 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-                         int pix) {
+                         int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    "vst1.u8    {d1}, [%1]!                    \n"  // store 8 U.
-    "vst1.u8    {d3}, [%2]!                    \n"  // store 8 V.
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
+    "+r"(width)        // %3
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
   );
 }
-#endif  // HAS_YUY2TOYROW_NEON
 
-#ifdef HAS_UYVYTOYROW_NEON
 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-                         int pix) {
+                         int width) {
   asm volatile (
-    ".p2align  2                               \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    "vst1.u8    {d0}, [%1]!                    \n"  // store 8 U.
-    "vst1.u8    {d2}, [%2]!                    \n"  // store 8 V.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
+    "+r"(width)        // %3
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
   );
 }
-#endif  // HAS_UYVYTOYROW_NEON
 
-#ifdef HAS_YUY2TOYROW_NEON
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+                      uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "adds       %1, %0, %1                     \n"  // stride + src_yuy2
-    ".p2align  2                               \n"
+    "add        %1, %0, %1                     \n"  // stride + src_yuy2
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
     "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
     "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
-    "vst1.u8    {d1}, [%2]!                    \n"  // store 8 U.
-    "vst1.u8    {d3}, [%3]!                    \n"  // store 8 V.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),     // %0
     "+r"(stride_yuy2),  // %1
     "+r"(dst_u),        // %2
     "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
+    "+r"(width)           // %4
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
   );
 }
-#endif  // HAS_YUY2TOYROW_NEON
 
-#ifdef HAS_UYVYTOYROW_NEON
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+                      uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "adds       %1, %0, %1                     \n"  // stride + src_uyvy
-    ".p2align  2                               \n"
+    "add        %1, %0, %1                     \n"  // stride + src_uyvy
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
     "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
     "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
-    "vst1.u8    {d0}, [%2]!                    \n"  // store 8 U.
-    "vst1.u8    {d2}, [%3]!                    \n"  // store 8 V.
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),     // %0
     "+r"(stride_uyvy),  // %1
     "+r"(dst_u),        // %2
     "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    "vdup.32    d2, %2                         \n"  // dither4
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d20, d20, d2                   \n"
+    "vqadd.u8   d21, d21, d2                   \n"
+    "vqadd.u8   d22, d22, d2                   \n"
+    ARGBTORGB565
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+  : "+r"(dst_rgb)    // %0
+  : "r"(src_argb),   // %1
+    "r"(dither4),    // %2
+    "r"(width)       // %3
+  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB1555
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB4444
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
+    "bgt       1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_a),      // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
+    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlsl.u8   q2, d1, d25                    \n"  // G
+    "vmlsl.u8   q2, d2, d26                    \n"  // R
+    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+
+    "vmull.u8   q3, d2, d24                    \n"  // R
+    "vmlsl.u8   q3, d1, d28                    \n"  // G
+    "vmlsl.u8   q3, d0, d27                    \n"  // B
+    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
+    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
+    "vpadd.u16  d1, d8, d9                     \n"  // B
+    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
+    "vpadd.u16  d3, d10, d11                   \n"  // G
+    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
+    "vpadd.u16  d5, d12, d13                   \n"  // R
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
+    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
+    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
+    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
+    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
+    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
+    "vrshr.u16  q2, q2, #1                     \n"
+    "vrshr.u16  q3, q3, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
+    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
+    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_stride_rgb24),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_raw
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(src_stride_raw),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_stride_argb1555),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_stride_argb4444),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  asm volatile (
+    "cmp        %4, #0                         \n"
+    "beq        100f                           \n"
+    "add        %2, %1                         \n"
+    "cmp        %4, #128                       \n"
+    "beq        50f                            \n"
+
+    "vdup.8     d5, %4                         \n"
+    "rsb        %4, #256                       \n"
+    "vdup.8     d4, %4                         \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vmull.u8   q13, d0, d4                    \n"
+    "vmull.u8   q14, d1, d4                    \n"
+    "vmlal.u8   q13, d2, d5                    \n"
+    "vmlal.u8   q14, d3, d5                    \n"
+    "vrshrn.u16 d0, q13, #8                    \n"
+    "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(y1_fraction)       // %4
+  :
+  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %3, #8                         \n"
+    "blt        89f                            \n"
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+    "bge        8b                             \n"
+
+  "89:                                         \n"
+    "adds       %3, #8-1                       \n"
+    "blt        99f                            \n"
+
+    // Blend 1 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+    "bge        1b                             \n"
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  );
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d0, d3                    \n"  // b * a
+    "vmull.u8   q11, d1, d3                    \n"  // g * a
+    "vmull.u8   q12, d2, d3                    \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "vdup.u16   q8, %2                         \n"
+    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+    "vdup.u16   q9, %3                         \n"  // interval multiply.
+    "vdup.u16   q10, %4                        \n"  // interval add
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+    "vmovl.u8   q1, d2                         \n"
+    "vmovl.u8   q2, d4                         \n"
+    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+    "vqdmulh.s16 q1, q1, q8                    \n"  // g
+    "vqdmulh.s16 q2, q2, q8                    \n"  // r
+    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+    "vmul.u16   q1, q1, q9                     \n"  // g
+    "vmul.u16   q2, q2, q9                     \n"  // r
+    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+    "vadd.u16   q1, q1, q10                    \n"  // g
+    "vadd.u16   q2, q2, q10                    \n"  // r
+    "vqmovn.u16 d0, q0                         \n"
+    "vqmovn.u16 d2, q1                         \n"
+    "vqmovn.u16 d4, q2                         \n"
+    MEMACCESS(0)
+    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"
+    "vmovl.u8   q12, d24                       \n"
+    "vmovl.u8   q13, d26                       \n"
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"
+    "vqmovn.u16 d22, q11                       \n"
+    "vqmovn.u16 d24, q12                       \n"
+    "vqmovn.u16 d26, q13                       \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+    "vmov       d1, d0                         \n"  // G
+    "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // BB coefficient
+    "vmov.u8    d29, #98                       \n"  // BG coefficient
+    "vmov.u8    d30, #50                       \n"  // BR coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q9, d18                        \n"  // g
+    "vmovl.u8   q10, d20                       \n"  // r
+    "vmovl.u8   q11, d22                       \n"  // a
+    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q0, d0, d1                     \n"  // multiply B
+    "vmull.u8   q1, d2, d3                     \n"  // multiply G
+    "vmull.u8   q2, d4, d5                     \n"  // multiply R
+    "vmull.u8   q3, d6, d7                     \n"  // multiply A
+    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // add
+    "vmov.u8    d1, d0                         \n"
+    "vmov.u8    d2, d0                         \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vqadd.u8   q0, q0, q1                     \n"  // add
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
   :
-  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "q0", "q1"
   );
 }
-#endif  // HAS_UYVYTOYROW_NEON
 
-#endif  // __ARM_NEON__
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // add
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
+    "vld1.8     {d1}, [%0],%6                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(1)
+    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%6                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2],%6                  \n"
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(3)
+    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5
+    "r"(6)             // %6
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1],%4                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%4                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%5                  \n"
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4
+    "r"(6)             // %5
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
new file mode 100644
index 00000000..6375d4f5
--- /dev/null
+++ b/files/source/row_neon64.cc
@@ -0,0 +1,2809 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.s}[0], [%1], #4            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.s}[1], [%2], #4            \n"
+
+// Read 8 Y, 2 U and 2 V from 422
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.h}[0], [%1], #2            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v2.h}[1], [%2], #2            \n"                             \
+    "zip1       v1.8b, v2.8b, v2.8b            \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.d}[0], [%1], #8            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.d}[1], [%2], #8            \n"                             \
+    "uaddlp     v1.8h, v1.16b                  \n"                             \
+    "rshrn      v1.8b, v1.8h, #1               \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    "movi       v1.8b , #128                   \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 YUY2
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
+    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
+    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 UYVY
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
+    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+#define YUVTORGB_SETUP                                                         \
+    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
+    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
+    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
+    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
+    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
+    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
+
+#define YUVTORGB(vR, vG, vB)                                                   \
+    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
+    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
+    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
+    "ushll      v0.4s, v0.4h, #0               \n"                             \
+    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
+    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
+    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
+    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
+    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
+    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
+    "uxtl       v2.8h, v2.8b                   \n"                             \
+    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
+    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
+    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
+    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
+    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
+    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
+    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
+    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
+    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
+    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
+    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
+    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
+    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
+    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
+    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             const uint8* src_a,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    MEMACCESS(3)
+    "ld1        {v23.8b}, [%3], #8             \n"
+    "subs       %w5, %w5, #8                   \n"
+    MEMACCESS(4)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(src_a),     // %3
+      "+r"(dst_argb),  // %4
+      "+r"(width)      // %5
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v20.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v23, v22, v21)
+    "subs       %w4, %w4, #8                   \n"
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    MEMACCESS(3)
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgb24), // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+#define ARGBTORGB565                                                           \
+    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
+    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    ARGBTORGB565
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+#define ARGBTOARGB1555                                                         \
+    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
+    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
+    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
+    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    ARGBTOARGB1555
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+#define ARGBTOARGB4444                                                         \
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
+    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
+    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
+    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
+    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
+    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
+    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n"
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READYUV400
+    YUVTORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v20.8b}, [%0], #8             \n"
+    "orr        v21.8b, v20.8b, v20.8b         \n"
+    "orr        v22.8b, v20.8b, v20.8b         \n"
+    "subs       %w2, %w2, #8                   \n"
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "v20", "v21", "v22", "v23"
+  );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    ARGBTORGB565
+    MEMACCESS(2)
+    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READYUY2
+    YUVTORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+    "movi       v23.8b, #255                   \n"
+  "1:                                          \n"
+    READUYVY
+    YUVTORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
+    MEMACCESS(2)
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
+    "b.gt       1b                             \n"
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+    "b.gt       1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
+    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
+    "b.gt       1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt       1b                             \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "v0"
+  );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt       1b                             \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "v0"
+  );
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    // Start at end of source row.
+    "add        %0, %0, %w2, sxtw              \n"
+    "sub        %0, %0, #16                    \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
+  );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  asm volatile (
+    // Start at end of source row.
+    "add        %0, %0, %w3, sxtw #1           \n"
+    "sub        %0, %0, #16                    \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"
+    "rev64      v1.8b, v1.8b                   \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
+  );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+  // Start at end of source row.
+    "add        %0, %0, %w2, sxtw #2           \n"
+    "sub        %0, %0, #16                    \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
+  );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v4.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v5.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+    MEMACCESS(1)
+    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+    "b.gt       1b                             \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+    MEMACCESS(1)
+    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
+    "b.gt       1b                             \n"
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB                                                           \
+    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
+    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
+    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
+    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
+    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
+    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
+    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
+    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB                                                         \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
+                                                                               \
+    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
+    "xtn2       v3.16b, v2.8h                  \n"                             \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
+    "dup        v1.2D, v0.D[1]                 \n"                             \
+    "dup        v3.2D, v2.D[1]                 \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB                                                           \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+#define ARGB4444TOARGB                                                         \
+    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
+    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
+    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
+    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
+    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
+    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
+    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
+    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
+    "dup        v0.2D, v2.D[1]                 \n"                             \
+    "dup        v1.2D, v3.D[1]                 \n"
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
+    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
+    MEMACCESS(1)
+    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
+  : "+r"(src_yuy2),     // %0
+    "+r"(src_yuy2b),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
+  : "+r"(src_uyvy),     // %0
+    "+r"(src_uyvyb),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "orr        v2.8b, v1.8b, v1.8b            \n"
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+    "orr        v3.8b, v2.8b, v2.8b            \n"
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTORGB565
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    "dup        v1.4s, %w2                     \n"  // dither4
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v20.8b, v20.8b, v1.8b          \n"
+    "uqadd      v21.8b, v21.8b, v1.8b          \n"
+    "uqadd      v22.8b, v22.8b, v1.8b          \n"
+    ARGBTORGB565
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+  : "+r"(dst_rgb)    // %0
+  : "r"(src_argb),   // %1
+    "r"(dither4),    // %2
+    "r"(width)       // %3
+  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
+  );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB1555
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB4444
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+  );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_a),      // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
+    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+    "movi       v29.16b,#0x80                  \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
+    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v24", "v25", "v26", "v27", "v28", "v29"
+  );
+}
+
+#define RGBTOUV_SETUP_REG                                                      \
+    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
+    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
+    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
+    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
+    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
+    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
+
+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
+    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
+    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
+    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
+    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
+    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
+    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
+    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
+    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
+    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
+    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;
+  asm volatile (
+    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
+    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
+    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
+    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
+    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
+    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
+    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v3.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_bgra_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"
+    "urshr      v1.8h, v1.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v2.8h, v1.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_abgr_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_rgba_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_rgb24_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_raw_1 = src_raw + src_stride_raw;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
+    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
+    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v0.8h, v0.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v2.8h, v1.8h, v0.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(src_raw_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+  asm volatile (
+    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
+    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    "ins        v16.D[1], v17.D[0]             \n"
+    "ins        v18.D[1], v19.D[0]             \n"
+    "ins        v20.D[1], v21.D[0]             \n"
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v18.8h, #1              \n"
+    "urshr      v6.8h, v20.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
+    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
+    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
+    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
+    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
+    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
+    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_rgb565_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+    "v25", "v26", "v27"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    "ins        v16.D[1], v26.D[0]             \n"
+    "ins        v17.D[1], v27.D[0]             \n"
+    "ins        v18.D[1], v28.D[0]             \n"
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_argb1555_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    "ins        v16.D[1], v26.D[0]             \n"
+    "ins        v17.D[1], v27.D[0]             \n"
+    "ins        v18.D[1], v28.D[0]             \n"
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_argb4444_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+
+  );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+    "v24", "v25", "v26", "v27"
+  );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
+  );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),      // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),    // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  asm volatile (
+    "cmp        %w4, #0                        \n"
+    "b.eq       100f                           \n"
+    "cmp        %w4, #128                      \n"
+    "b.eq       50f                            \n"
+
+    "dup        v5.16b, %w4                    \n"
+    "dup        v4.16b, %w5                    \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "umull      v2.8h, v0.8b,  v4.8b           \n"
+    "umull2     v3.8h, v0.16b, v4.16b          \n"
+    "umlal      v2.8h, v1.8b,  v5.8b           \n"
+    "umlal2     v3.8h, v1.16b, v5.16b          \n"
+    "rshrn      v0.8b,  v2.8h, #8              \n"
+    "rshrn2     v0.16b, v3.8h, #8              \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_ptr1),         // %2
+    "+r"(dst_width),        // %3
+    "+r"(y1_fraction),      // %4
+    "+r"(y0_fraction)       // %5
+  :
+  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
+  );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %w3, %w3, #8                   \n"
+    "b.lt       89f                            \n"
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.ge       8b                             \n"
+
+  "89:                                         \n"
+    "adds       %w3, %w3, #8-1                 \n"
+    "b.lt       99f                            \n"
+
+    // Blend 1 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+    "b.ge       1b                             \n"
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18"
+  );
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
+    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "dup        v4.8h, %w2                     \n"
+    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+    "dup        v5.8h, %w3                     \n"  // interval multiply.
+    "dup        v6.8h, %w4                     \n"  // interval add
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
+    "uxtl       v1.8h, v1.8b                   \n"
+    "uxtl       v2.8h, v2.8b                   \n"
+    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+    "add        v1.8h, v1.8h, v6.8h            \n"  // g
+    "add        v2.8h, v2.8h, v6.8h            \n"  // r
+    "uqxtn      v0.8b, v0.8h                   \n"
+    "uqxtn      v1.8b, v1.8h                   \n"
+    "uqxtn      v2.8b, v2.8h                   \n"
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+    "uxtl       v5.8h, v5.8b                   \n"
+    "uxtl       v6.8h, v6.8b                   \n"
+    "uxtl       v7.8h, v7.8b                   \n"
+    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+    "uqxtn      v4.8b, v4.8h                   \n"
+    "uqxtn      v5.8b, v5.8h                   \n"
+    "uqxtn      v6.8b, v6.8h                   \n"
+    "uqxtn      v7.8b, v7.8h                   \n"
+    MEMACCESS(1)
+    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
+  );
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v20.8b, #17                    \n"  // BB coefficient
+    "movi       v21.8b, #68                    \n"  // BG coefficient
+    "movi       v22.8b, #35                    \n"  // BR coefficient
+    "movi       v24.8b, #22                    \n"  // GB coefficient
+    "movi       v25.8b, #88                    \n"  // GG coefficient
+    "movi       v26.8b, #45                    \n"  // GR coefficient
+    "movi       v28.8b, #24                    \n"  // BB coefficient
+    "movi       v29.8b, #98                    \n"  // BG coefficient
+    "movi       v30.8b, #50                    \n"  // BR coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
+    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
+    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
+    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
+    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
+    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
+    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
+    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
+    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
+    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+  );
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
+    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+    "uxtl       v17.8h, v17.8b                 \n"  // g
+    "uxtl       v18.8h, v18.8b                 \n"  // r
+    "uxtl       v19.8h, v19.8b                 \n"  // a
+    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v22", "v23", "v24", "v25"
+  );
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+    "orr        v1.8b, v0.8b, v0.8b            \n"
+    "orr        v2.8b, v0.8b, v0.8b            \n"
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+    "b.gt       1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1"
+  );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%5               \n"  // top
+    MEMACCESS(0)
+    "ld1        {v1.8b}, [%0],%6               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
+    MEMACCESS(1)
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%6               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2],%6               \n"
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
+    MEMACCESS(3)
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+    "b.gt       1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2LL),          // %5
+    "r"(6LL)           // %6
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%4               \n"  // left
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1],%4               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%4               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%5               \n"  // right
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%5               \n"
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+    "b.gt       1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1LL),          // %4
+    "r"(6LL)           // %5
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc
deleted file mode 100644
index 33149dad..00000000
--- a/files/source/row_posix.cc
+++ /dev/null
@@ -1,3662 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64
-#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constants for ARGB
-CONST vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-CONST vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-CONST vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-// Constants for BGRA
-CONST vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-CONST vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-CONST vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR
-CONST vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-CONST vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-CONST vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-CONST uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-CONST uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RGB24 to ARGB.
-CONST uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-CONST uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting ABGR to ARGB.
-CONST uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting BGRA to ARGB.
-CONST uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-// Shuffle table for converting RGBA to ARGB.
-CONST uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-
-// Shuffle table for converting ARGB to RGBA.
-CONST uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-CONST uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-CONST uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movq      (%0),%%xmm0                     \n"
-    "lea       0x8(%0),%0                      \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "lea       0x20(%1),%1                     \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskABGRToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskBGRAToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRGBAToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgba),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskARGBToRGBA)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm3                 \n"
-    "lea       0x30(%0),%0                     \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    %%xmm2,0x20(%1)                 \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm3,0x30(%1)                 \n"
-    "lea       0x40(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm3                 \n"
-    "lea       0x30(%0),%0                     \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    %%xmm2,0x20(%1)                 \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm3,0x30(%1)                 \n"
-    "lea       0x40(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x20802080,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xa,%%xmm4                     \n"
-    "psrlw     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm1,(%1,%0,2)                \n"
-    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
-    "lea       0x10(%0),%0                     \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x42004200,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "movdqa    %%xmm3,%%xmm4                   \n"
-    "psrlw     $0x6,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psllw     $0x1,%%xmm1                     \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm1,(%1,%0,2)                \n"
-    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
-    "lea       0x10(%0),%0                     \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0xf0f0f0f,%%eax                \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x4,%%xmm5                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "psllw     $0x4,%%xmm1                     \n"
-    "psrlw     $0x4,%%xmm3                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%1,%0,2)                \n"
-    "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
-    "lea       0x10(%0),%0                     \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "lea       0x40(%0),%0                     \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "movdqa    %%xmm2,0x20(%1)                 \n"
-    "lea       0x30(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
-  );
-}
-
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "lea       0x40(%0),%0                     \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "movdqa    %%xmm2,0x20(%1)                 \n"
-    "lea       0x30(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
-  );
-}
-
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psrld     $0x1b,%%xmm3                    \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1a,%%xmm4                    \n"
-    "pslld     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0xb,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pslld     $0x8,%%xmm0                     \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x5,%%xmm2                     \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       0x10(%0),%0                     \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1b,%%xmm4                    \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x5,%%xmm5                     \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "pslld     $0xa,%%xmm6                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "pslld     $0xf,%%xmm7                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x6,%%xmm2                     \n"
-    "psrld     $0x9,%%xmm3                     \n"
-    "pand      %%xmm7,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm6,%%xmm3                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       0x10(%0),%0                     \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm4,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm3,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "psrlq     $0x4,%%xmm0                     \n"
-    "psrlq     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "lea       0x10(%0),%0                     \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
-  );
-}
-
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm6                 \n"
-    "pavgb     (%0,%4,1),%%xmm0                \n"
-    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
-    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
-    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_argb))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),         // %0
-    "m"(kARGBToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm6                 \n"
-    "movdqu    (%0,%4,1),%%xmm7                \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_argb))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kBGRAToU),         // %0
-    "m"(kBGRAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm6                 \n"
-    "pavgb     (%0,%4,1),%%xmm0                \n"
-    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
-    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
-    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_bgra))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kBGRAToU),         // %0
-    "m"(kBGRAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm6                 \n"
-    "movdqu    (%0,%4,1),%%xmm7                \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_bgra))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm3                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kABGRToU),         // %0
-    "m"(kABGRToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm6                 \n"
-    "pavgb     (%0,%4,1),%%xmm0                \n"
-    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
-    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
-    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_abgr))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kABGRToU),         // %0
-    "m"(kABGRToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    0x20(%0),%%xmm2                 \n"
-    "movdqu    0x30(%0),%%xmm6                 \n"
-    "movdqu    (%0,%4,1),%%xmm7                \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "lea       0x40(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0,(%1)                     \n"
-    "movhps    %%xmm0,(%1,%2,1)                \n"
-    "lea       0x8(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"(static_cast<intptr_t>(src_stride_abgr))
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-#endif  // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-
-struct {
-  vec8 kUVToB;  // 0
-  vec8 kUVToG;  // 16
-  vec8 kUVToR;  // 32
-  vec16 kUVBiasB;  // 48
-  vec16 kUVBiasG;  // 64
-  vec16 kUVBiasR;  // 80
-  vec16 kYSub16;  // 96
-  vec16 kYToRgb;  // 112
-  vec8 kVUToB;  // 128
-  vec8 kVUToG;  // 144
-  vec8 kVUToR;  // 160
-} CONST SIMD_ALIGNED(kYuvConstants) = {
-  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16 },
-  { YG, YG, YG, YG, YG, YG, YG, YG },
-  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
-};
-
-
-// Read 8 UV from 411
-#define READYUV444                                                             \
-    "movq       (%[u_buf]),%%xmm0              \n"                             \
-    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
-    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
-    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       (%[u_buf]),%%xmm0              \n"                             \
-    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
-    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
-    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
-    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
-
-// Read 2 UV from 411, upsample to 8 UV
-#define READYUV411                                                             \
-    "movd       (%[u_buf]),%%xmm0              \n"                             \
-    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
-    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
-    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
-    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
-    "punpckldq  %%xmm0,%%xmm0                  \n"                             \
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       (%[uv_buf]),%%xmm0             \n"                             \
-    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
-    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
-
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB                                                               \
-    "movdqa     %%xmm0,%%xmm1                  \n"                             \
-    "movdqa     %%xmm0,%%xmm2                  \n"                             \
-    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
-    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
-    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
-    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
-    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
-    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
-    "movq       (%[y_buf]),%%xmm3              \n"                             \
-    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
-    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
-    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
-    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
-    "paddsw     %%xmm3,%%xmm0                  \n"                             \
-    "paddsw     %%xmm3,%%xmm1                  \n"                             \
-    "paddsw     %%xmm3,%%xmm2                  \n"                             \
-    "psraw      $0x6,%%xmm0                    \n"                             \
-    "psraw      $0x6,%%xmm1                    \n"                             \
-    "psraw      $0x6,%%xmm2                    \n"                             \
-    "packuswb   %%xmm0,%%xmm0                  \n"                             \
-    "packuswb   %%xmm1,%%xmm1                  \n"                             \
-    "packuswb   %%xmm2,%%xmm2                  \n"                             \
-
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB                                                               \
-    "movdqa     %%xmm0,%%xmm1                  \n"                             \
-    "movdqa     %%xmm0,%%xmm2                  \n"                             \
-    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
-    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
-    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
-    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
-    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
-    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
-    "movq       (%[y_buf]),%%xmm3              \n"                             \
-    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
-    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
-    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
-    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
-    "paddsw     %%xmm3,%%xmm0                  \n"                             \
-    "paddsw     %%xmm3,%%xmm1                  \n"                             \
-    "paddsw     %%xmm3,%%xmm2                  \n"                             \
-    "psraw      $0x6,%%xmm0                    \n"                             \
-    "psraw      $0x6,%%xmm1                    \n"                             \
-    "psraw      $0x6,%%xmm2                    \n"                             \
-    "packuswb   %%xmm0,%%xmm0                  \n"                             \
-    "packuswb   %%xmm1,%%xmm1                  \n"                             \
-    "packuswb   %%xmm2,%%xmm2                  \n"                             \
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* argb_buf,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* argb_buf,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* argb_buf,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* argb_buf,
-                                int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* vu_buf,
-                                uint8* argb_buf,
-                                int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READNV12
-    YVUTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* argb_buf,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* argb_buf,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* argb_buf,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* uv_buf,
-                                          uint8* argb_buf,
-                                          int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* vu_buf,
-                                          uint8* argb_buf,
-                                          int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READNV12
-    YVUTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
-    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* bgra_buf,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm5,(%[argb_buf])            \n"
-    "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* abgr_buf,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm2                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,(%[argb_buf])            \n"
-    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* bgra_buf,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm5,(%[argb_buf])            \n"
-    "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* abgr_buf,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm2                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,(%[argb_buf])            \n"
-    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
-    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_I422TOARGBROW_SSSE3
-
-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "mov       $0x10001000,%%eax               \n"
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "mov       $0x012a012a,%%eax               \n"
-    "movd      %%eax,%%xmm2                    \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    "movq      (%0),%%xmm0                     \n"
-    "lea       0x8(%0),%0                      \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "psubusw   %%xmm3,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-
-    // Step 2: Weave into ARGB
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "por       %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "movdqa    %%xmm1,16(%1)                   \n"
-    "lea       32(%1),%1                       \n"
-
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(rgb_buf),  // %1
-    "+rm"(width)    // %2
-  :
-  : "memory", "cc", "eax"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
-  );
-}
-#endif  // HAS_YTOARGBROW_SSE2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-CONST uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "lea       -0x10(%0),%0                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0,%2),%%xmm0                  \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_SSE2
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "lea       -0x10(%0),%0                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0,%2),%%xmm0                  \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#endif  // HAS_MIRRORROW_SSE2
-
-#ifdef HAS_MIRRORROW_UV_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-CONST uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "movdqa    %4,%%xmm1                       \n"
-    "lea       -16(%0,%3,2),%0                 \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "lea       -16(%0),%0                      \n"
-    "pshufb    %%xmm1,%%xmm0                   \n"
-    "sub       $8,%3                           \n"
-    "movlpd    %%xmm0,(%1)                     \n"
-    "movhpd    %%xmm0,(%1,%2)                  \n"
-    "lea       8(%1),%1                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),      // %0
-    "+r"(dst_u),    // %1
-    "+r"(dst_v),    // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#endif  // HAS_MIRRORROW_UV_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-CONST uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "lea       -0x10(%0),%0                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0,%2,4),%%xmm0                \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBMIRRORROW_SSSE3
-
-#ifdef HAS_SPLITUV_SSE2
-void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
-    "sub        %1,%2                            \n"
-    ".p2align  4                               \n"
-  "1:                                            \n"
-    "movdqa     (%0),%%xmm0                      \n"
-    "movdqa     0x10(%0),%%xmm1                  \n"
-    "lea        0x20(%0),%0                      \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm2,(%1,%2)                   \n"
-    "lea        0x10(%1),%1                      \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(pix)         // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_SPLITUV_SSE2
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-    "sub        %0,%1                          \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    %%xmm0,(%0,%1)                  \n"
-    "movdqa    %%xmm1,0x10(%0,%1)              \n"
-    "lea       0x20(%0),%0                     \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#endif  // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
-  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (
-    "shr       $0x2,%2                         \n"
-    "rep movsl                                 \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
-}
-#endif  // HAS_COPYROW_X86
-
-#ifdef HAS_SETROW_X86
-void SetRow8_X86(uint8* dst, uint32 v32, int width) {
-  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (
-    "shr       $0x2,%1                         \n"
-    "rep stosl                                 \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
-}
-
-void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                   int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    size_t width_tmp = static_cast<size_t>(width);
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    asm volatile (
-      "rep stosl                               \n"
-      : "+D"(d),         // %0
-        "+c"(width_tmp)  // %1
-      : "a"(v32)         // %2
-      : "memory", "cc");
-    dst += dst_stride;
-  }
-}
-#endif  // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%4,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
-                                int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    (%0,%4,1),%%xmm2                \n"
-    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%4,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix) {
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    (%0,%4,1),%%xmm2                \n"
-    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,(%1,%2)                  \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x1,%3                         \n"
-    "je        91f                             \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop until destination pointer is aligned.
-  "10:                                         \n"
-    "test      $0xf,%2                         \n"
-    "je        19f                             \n"
-    "movd      (%0),%%xmm3                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      (%1),%%xmm2                     \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      (%1),%%xmm1                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-    "add       $1-4,%3                         \n"
-    "jl        49f                             \n"
-
-    // 4 pixel loop.
-    ".p2align  2                               \n"
-  "41:                                         \n"
-    "movdqu    (%0),%%xmm3                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    (%1),%%xmm2                     \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    (%1),%%xmm1                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "jge       41b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      (%0),%%xmm3                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      (%1),%%xmm2                     \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      (%1),%%xmm1                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-#endif  // HAS_ARGBBLENDROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-CONST uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-
-// Blend 8 pixels at a time
-// Shuffle table for reversing the bytes.
-
-// Same as SSE2, but replaces
-//    psrlw      xmm3, 8          // alpha
-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-//    pshuflw    xmm3, xmm3,0F5h
-// with..
-//    pshufb     xmm3, kShuffleAlpha // alpha
-
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x1,%3                         \n"
-    "je        91f                             \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop until destination pointer is aligned.
-  "10:                                         \n"
-    "test      $0xf,%2                         \n"
-    "je        19f                             \n"
-    "movd      (%0),%%xmm3                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      (%1),%%xmm2                     \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      (%1),%%xmm1                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-    "add       $1-4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%0                         \n"
-    "jne       41f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       41f                             \n"
-
-    // 4 pixel loop.
-    ".p2align  2                               \n"
-  "40:                                         \n"
-    "movdqa    (%0),%%xmm3                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqa    (%1),%%xmm2                     \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqa    (%1),%%xmm1                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "jge       40b                             \n"
-    "jmp       49f                             \n"
-
-    // 4 pixel unaligned loop.
-    ".p2align  2                               \n"
-  "41:                                         \n"
-    "movdqu    (%0),%%xmm3                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    (%1),%%xmm2                     \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    (%1),%%xmm1                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "jge       41b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      (%0),%%xmm3                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      (%1),%%xmm2                     \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      (%1),%%xmm1                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  : "m"(kShuffleAlpha)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-#endif  // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATE_SSE2
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x8,%%xmm5                     \n"
-
-    // 4 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBATTENUATE_SSE2
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-CONST uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-CONST uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "pslld     $0x18,%%xmm3                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-
-    // 4 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "punpcklbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "punpckhbw %%xmm2,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha0),  // %3
-    "m"(kShuffleAlpha1)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  uintptr_t alpha = 0;
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-
-    // 4 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movzb     0x3(%0),%3                      \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movd      0x0(%4,%3,4),%%xmm2             \n"
-    "movzb     0x7(%0),%3                      \n"
-    "movd      0x0(%4,%3,4),%%xmm3             \n"
-    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "movzb     0xb(%0),%3                      \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movd      0x0(%4,%3,4),%%xmm2             \n"
-    "movzb     0xf(%0),%3                      \n"
-    "movd      0x0(%4,%3,4),%%xmm3             \n"
-    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
-  : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
-CONST vec8 kARGBToGray = {
-  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
-};
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "sub       %0,%1                           \n"
-
-    // 8 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "movdqa    0x10(%0),%%xmm3                 \n"
-    "psrld     $0x18,%%xmm2                    \n"
-    "psrld     $0x18,%%xmm3                    \n"
-    "packuswb  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm3                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm1                   \n"
-    "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
-    "lea       0x20(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "m"(kARGBToGray)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
-  );
-}
-#endif  // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-//    b = (r * 35 + g * 68 + b * 17) >> 7
-//    g = (r * 45 + g * 88 + b * 22) >> 7
-//    r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone
-CONST vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-CONST vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-CONST vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %2,%%xmm2                       \n"
-    "movdqa    %3,%%xmm3                       \n"
-    "movdqa    %4,%%xmm4                       \n"
-
-    // 8 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm6                 \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "phaddw    %%xmm6,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm5                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm5                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqa    (%0),%%xmm6                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%1                         \n"
-    "movdqa    %%xmm0,(%0)                     \n"
-    "movdqa    %%xmm1,0x10(%0)                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "m"(kARGBToSepiaB),  // %2
-    "m"(kARGBToSepiaG),  // %3
-    "m"(kARGBToSepiaR)   // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
-  );
-}
-#endif  // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width) {
-  asm volatile (
-    "movd      (%2),%%xmm2                     \n"
-    "movd      0x4(%2),%%xmm3                  \n"
-    "movd      0x8(%2),%%xmm4                  \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-
-    // 8 pixel loop.
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm6                 \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "movdqa    (%0),%%xmm5                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm6,%%xmm0                   \n"
-    "phaddsw   %%xmm1,%%xmm5                   \n"
-    "psraw     $0x7,%%xmm0                     \n"
-    "psraw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm5                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddsw   %%xmm1,%%xmm5                   \n"
-    "psraw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqa    (%0),%%xmm6                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%1                         \n"
-    "movdqa    %%xmm0,(%0)                     \n"
-    "movdqa    %%xmm1,0x10(%0)                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "r"(matrix_argb)     // %2
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
-  );
-}
-#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "movd      %2,%%xmm2                       \n"
-    "movd      %3,%%xmm3                       \n"
-    "movd      %4,%%xmm4                       \n"
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "pslld     $0x18,%%xmm6                    \n"
-
-    // 4 pixel loop.
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "pmullw    %%xmm3,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm7                     \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm6,%%xmm7                   \n"
-    "paddw     %%xmm4,%%xmm0                   \n"
-    "paddw     %%xmm4,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "sub       $0x4,%1                         \n"
-    "movdqa    %%xmm0,(%0)                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-#endif  // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  asm volatile (
-    "sub       %1,%2                           \n"
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       49f                             \n"
-
-  // 4 pixel loop                              \n"
-    ".p2align  2                               \n"
-  "40:                                         \n"
-    "movdqu    (%0),%%xmm2                     \n"
-    "lea       0x10(%0),%0                     \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "punpckhwd %%xmm1,%%xmm3                   \n"
-    "punpckhbw %%xmm1,%%xmm4                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "punpcklwd %%xmm1,%%xmm4                   \n"
-    "punpckhwd %%xmm1,%%xmm5                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqa    (%1,%2,1),%%xmm2                \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
-    "paddd     %%xmm0,%%xmm3                   \n"
-    "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
-    "paddd     %%xmm0,%%xmm4                   \n"
-    "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
-    "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqa    %%xmm2,(%1)                     \n"
-    "movdqa    %%xmm3,0x10(%1)                 \n"
-    "movdqa    %%xmm4,0x20(%1)                 \n"
-    "movdqa    %%xmm5,0x30(%1)                 \n"
-    "lea       0x40(%1),%1                     \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    ".p2align  2                               \n"
-  "10:                                         \n"
-    "movd      (%0),%%xmm2                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    (%1,%2,1),%%xmm2                \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "movdqu    %%xmm2,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-  : "+r"(row),  // %0
-    "+r"(cumsum),  // %1
-    "+r"(previous_cumsum),  // %2
-    "+r"(width)  // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
-void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count) {
-  asm volatile (
-    "movd      %5,%%xmm4                       \n"
-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
-    "rcpss     %%xmm4,%%xmm4                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-
-  // 4 pixel loop                              \n"
-    ".p2align  2                               \n"
-  "40:                                         \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "psubd     (%0,%4,4),%%xmm0                \n"
-    "psubd     0x10(%0,%4,4),%%xmm1            \n"
-    "psubd     0x20(%0,%4,4),%%xmm2            \n"
-    "psubd     0x30(%0,%4,4),%%xmm3            \n"
-    "lea       0x40(%0),%0                     \n"
-    "psubd     (%1),%%xmm0                     \n"
-    "psubd     0x10(%1),%%xmm1                 \n"
-    "psubd     0x20(%1),%%xmm2                 \n"
-    "psubd     0x30(%1),%%xmm3                 \n"
-    "paddd     (%1,%4,4),%%xmm0                \n"
-    "paddd     0x10(%1,%4,4),%%xmm1            \n"
-    "paddd     0x20(%1,%4,4),%%xmm2            \n"
-    "paddd     0x30(%1,%4,4),%%xmm3            \n"
-    "lea       0x40(%1),%1                     \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm1                   \n"
-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
-    "mulps     %%xmm4,%%xmm2                   \n"
-    "mulps     %%xmm4,%%xmm3                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    ".p2align  2                               \n"
-  "10:                                         \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "psubd     (%0,%4,4),%%xmm0                \n"
-    "lea       0x10(%0),%0                     \n"
-    "psubd     (%1),%%xmm0                     \n"
-    "paddd     (%1,%4,4),%%xmm0                \n"
-    "lea       0x10(%1),%1                     \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(topleft),  // %0
-    "+r"(botleft),  // %1
-    "+r"(dst),      // %2
-    "+rm"(count)    // %3
-  : "r"(static_cast<intptr_t>(width)),  // %4
-    "rm"(area)     // %5
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
-  );
-}
-#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
-#ifdef HAS_ARGBSHADE_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "movd      %3,%%xmm2                       \n"
-    "sub       %0,%1                           \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm2                  \n"
-
-    // 4 pixel loop.
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
-#endif
-  );
-}
-#endif  // HAS_ARGBSHADE_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// TODO(fbarchard): Find 64 bit way to avoid masking.
-// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
-// Copy ARGB pixels from source image with slope to a row of destination.
-// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
-// an error if movq is used. movd  %%xmm0,%1
-
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
-  intptr_t src_argb_stride_temp = src_argb_stride;
-  intptr_t temp = 0;
-  asm volatile (
-    "movq      (%3),%%xmm2                     \n"
-    "movq      0x8(%3),%%xmm7                  \n"
-    "shl       $0x10,%1                        \n"
-    "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm5                       \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
-
-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm0                   \n"
-    "movlhps   %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm7,%%xmm4                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-
-  // 4 pixel loop                              \n"
-    ".p2align  4                               \n"
-  "40:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "cvttps2dq %%xmm3,%%xmm1                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-#if defined(__x86_64__)
-    "movd      %%xmm0,%1                       \n"
-    "mov       %1,%5                           \n"
-    "and       $0x0fffffff,%1                  \n"
-    "shr       $32,%5                          \n"
-    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
-#else
-    "movd      %%xmm0,%1                       \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%5                       \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-#endif
-    "movd      (%0,%1,1),%%xmm1                \n"
-    "movd      (%0,%5,1),%%xmm6                \n"
-    "punpckldq %%xmm6,%%xmm1                   \n"
-    "addps     %%xmm4,%%xmm2                   \n"
-    "movq      %%xmm1,(%2)                     \n"
-#if defined(__x86_64__)
-    "movd      %%xmm0,%1                       \n"
-    "mov       %1,%5                           \n"
-    "and       $0x0fffffff,%1                  \n"
-    "shr       $32,%5                          \n"
-#else
-    "movd      %%xmm0,%1                       \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%5                       \n"
-#endif
-    "movd      (%0,%1,1),%%xmm0                \n"
-    "movd      (%0,%5,1),%%xmm6                \n"
-    "punpckldq %%xmm6,%%xmm0                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "sub       $0x4,%4                         \n"
-    "movq      %%xmm0,0x08(%2)                 \n"
-    "lea       0x10(%2),%2                     \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%4                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    ".p2align  4                               \n"
-  "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm2                   \n"
-    "movd      %%xmm0,%1                       \n"
-#if defined(__x86_64__)
-    "and       $0x0fffffff,%1                  \n"
-#endif
-    "movd      (%0,%1,1),%%xmm0                \n"
-    "sub       $0x1,%4                         \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_stride_temp),  // %1
-    "+r"(dst_argb),  // %2
-    "+r"(uv_dudv),   // %3
-    "+rm"(width),    // %4
-    "+r"(temp)   // %5
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-#endif  // HAS_ARGBAFFINEROW_SSE2
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "movdqa    (%1,%4,1),%%xmm2                \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "pmaddubsw %%xmm5,%%xmm0                   \n"
-    "pmaddubsw %%xmm5,%%xmm1                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "2:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-  "4:                                          \n"
-    ".p2align  4                               \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
-  );
-}
-
-#endif  // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index e3b01f27..2a3da896 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -4,21 +4,126 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/row.h"
 
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+#if defined(_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// 64 bit
+#if defined(_M_X64)
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422                                                             \
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
+    u_buf += 4;                                                                \
+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
+    y_buf += 8;
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422                                                            \
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
+    u_buf += 4;                                                                \
+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
+    y_buf += 8;                                                                \
+    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
+    a_buf += 8;
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(yuvconstants)                                                 \
+    xmm1 = _mm_loadu_si128(&xmm0);                                             \
+    xmm2 = _mm_loadu_si128(&xmm0);                                             \
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
+    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
+    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
+    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
+    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
+    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
+    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
+    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
+    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
+    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
+    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+
+// Store 8 ARGB values.
+#define STOREARGB                                                              \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
+    xmm1 = _mm_loadu_si128(&xmm0);                                             \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
+    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
+    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
+    dst_argb += 32;
+
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __m128i xmm0, xmm1, xmm2, xmm4;
+  const __m128i xmm5 = _mm_set1_epi8(-1);
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  while (width > 0) {
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    width -= 8;
+  }
+}
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {
+  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  while (width > 0) {
+    READYUVA422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    width -= 8;
+  }
+}
+#endif
 
-// TODO(fbarchard): I420ToRGB24, I420ToRAW
+// 32 bit
+#else  // defined(_M_X64)
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
@@ -26,14 +131,33 @@ static const vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
+// JPeg full range.
+static const vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
 static const vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
 
+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
 static const vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
 // Constants for BGRA.
 static const vec8 kBGRAToY = {
   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
@@ -77,11 +201,20 @@ static const uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
+// 7 bit fixed point 0.5.
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
 static const uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -92,24 +225,22 @@ static const uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 
-// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+// Shuffle table for converting RAW to RGB24.  First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
-// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+// Shuffle table for converting RAW to RGB24.  Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
-// Shuffle table for converting ARGB to RGBA.
-static const uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
 // Shuffle table for converting ARGB to RGB24.
@@ -122,16 +253,51 @@ static const uvec8 kShuffleMaskARGBToRAW = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
 };
 
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // pix
+    mov        ecx, [esp + 12]       // width
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
 
-    align      16
   convertloop:
     movq       xmm0, qword ptr [eax]
     lea        eax,  [eax + 8]
@@ -141,8 +307,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
     punpckhwd  xmm1, xmm1
     por        xmm0, xmm5
     por        xmm1, xmm5
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
@@ -150,101 +316,48 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_bgra
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskBGRAToARGB
-    sub       edx, eax
-
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_abgr
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskABGRToARGB
-    sub       edx, eax
-
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_rgba
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskRGBAToARGB
-    sub       edx, eax
-
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgba
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskARGBToRGBA
-    sub       edx, eax
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_y
+    mov         edx, [esp + 8]        // dst_argb
+    mov         ecx, [esp + 12]       // width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    vpslld      ymm5, ymm5, 24
 
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
+  convertloop:
+    vmovdqu     xmm0, [eax]
+    lea         eax,  [eax + 16]
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpcklbw  ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhwd  ymm1, ymm0, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm0
+    vpor        ymm0, ymm0, ymm5
+    vpor        ymm1, ymm1, ymm5
+    vmovdqu     [edx], ymm0
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_J400TOARGBROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-__asm {
+__declspec(naked)
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_rgb24
     mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
-    movdqa    xmm4, kShuffleMaskRGB24ToARGB
+    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
 
-    align      16
  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + 16]
@@ -256,35 +369,34 @@ __asm {
     por       xmm2, xmm5
     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
-    movdqa    [edx + 32], xmm2
+    movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     por       xmm1, xmm5
     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
-    movdqa    [edx + 16], xmm1
+    movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
-    sub       ecx, 16
-    movdqa    [edx + 48], xmm3
+    movdqu    [edx + 48], xmm3
     lea       edx, [edx + 64]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int pix) {
-__asm {
+                        int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_raw
     mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
-    movdqa    xmm4, kShuffleMaskRAWToARGB
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
 
-    align      16
  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + 16]
@@ -296,18 +408,46 @@ __asm {
     por       xmm2, xmm5
     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
-    movdqa    [edx + 32], xmm2
+    movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     por       xmm1, xmm5
     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
-    movdqa    [edx + 16], xmm1
+    movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
-    sub       ecx, 16
-    movdqa    [edx + 48], xmm3
+    movdqu    [edx + 48], xmm3
     lea       edx, [edx + 64]
+    sub       ecx, 16
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_rgb24
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + 4]
+    movdqu    xmm2, [eax + 8]
+    lea       eax, [eax + 24]
+    pshufb    xmm0, xmm3
+    pshufb    xmm1, xmm4
+    pshufb    xmm2, xmm5
+    movq      qword ptr [edx], xmm0
+    movq      qword ptr [edx + 8], xmm1
+    movq      qword ptr [edx + 16], xmm2
+    lea       edx, [edx + 24]
+    sub       ecx, 8
     jg        convertloop
     ret
   }
@@ -320,10 +460,10 @@ __asm {
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int pix) {
-__asm {
+                          int width) {
+  __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
@@ -340,11 +480,10 @@ __asm {
 
     mov       eax, [esp + 4]   // src_rgb565
     mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
-    align      16
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
@@ -361,8 +500,8 @@ __asm {
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -370,11 +509,158 @@ __asm {
   }
 }
 
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                          int width) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    vmovd      xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpsllw     ymm4, ymm4, 10
+    vpsrlw     ymm4, ymm4, 5
+    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    sub        edx, eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    vmovd      xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax,  [esp + 4]   // src_argb1555
+    mov        edx,  [esp + 8]   // dst_argb
+    mov        ecx,  [esp + 12]  // width
+    sub        edx,  eax
+    sub        edx,  eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpand      ymm1, ymm1, ymm3
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpsraw     ymm2, ymm0, 8       // A
+    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpand      ymm2, ymm2, ymm7
+    vpor       ymm0, ymm0, ymm2    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    vmovd     xmm4, eax
+    vbroadcastss ymm4, xmm4
+    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]   // src_argb4444
+    mov       edx,  [esp + 8]   // dst_argb
+    mov       ecx,  [esp + 12]  // width
+    sub       edx,  eax
+    sub       edx,  eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5    // mask high nibbles
+    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vpsrlw     ymm3, ymm2, 4
+    vpsllw     ymm1, ymm0, 4
+    vpor       ymm2, ymm2, ymm3
+    vpor       ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpckhbw ymm1, ymm0, ymm2
+    vpunpcklbw ymm0, ymm0, ymm2
+    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+
 // 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) {
-__asm {
+                            int width) {
+  __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
@@ -390,11 +676,10 @@ __asm {
 
     mov       eax, [esp + 4]   // src_argb1555
     mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
-    align      16
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
@@ -415,8 +700,8 @@ __asm {
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -425,10 +710,10 @@ __asm {
 }
 
 // 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) {
-__asm {
+                            int width) {
+  __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
@@ -436,11 +721,10 @@ __asm {
     pslld     xmm5, 4
     mov       eax, [esp + 4]   // src_argb4444
     mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
-    align      16
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
@@ -455,8 +739,8 @@ __asm {
     movdqa    xmm1, xmm0
     punpcklbw xmm0, xmm2
     punpckhbw xmm1, xmm2
-    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -464,20 +748,19 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm6, kShuffleMaskARGBToRGB24
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
-    movdqa    xmm1, [eax + 16]
-    movdqa    xmm2, [eax + 32]
-    movdqa    xmm3, [eax + 48]
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
@@ -489,13 +772,13 @@ __asm {
     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
     por       xmm0, xmm4   // 4 bytes from 1 for 0
     pslldq    xmm5, 8      // 8 bytes from 2 for 1
-    movdqa    [edx], xmm0  // store 0
+    movdqu    [edx], xmm0  // store 0
     por       xmm1, xmm5   // 8 bytes from 2 for 1
     psrldq    xmm2, 8      // 4 bytes from 2
     pslldq    xmm3, 4      // 12 bytes from 3 for 2
     por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqa    [edx + 16], xmm1   // store 1
-    movdqa    [edx + 32], xmm2   // store 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -503,20 +786,19 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm6, kShuffleMaskARGBToRAW
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
-    movdqa    xmm1, [eax + 16]
-    movdqa    xmm2, [eax + 32]
-    movdqa    xmm3, [eax + 48]
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
@@ -528,13 +810,13 @@ __asm {
     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
     por       xmm0, xmm4   // 4 bytes from 1 for 0
     pslldq    xmm5, 8      // 8 bytes from 2 for 1
-    movdqa    [edx], xmm0  // store 0
+    movdqu    [edx], xmm0  // store 0
     por       xmm1, xmm5   // 8 bytes from 2 for 1
     psrldq    xmm2, 8      // 4 bytes from 2
     pslldq    xmm3, 4      // 12 bytes from 3 for 2
     por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqa    [edx + 16], xmm1   // store 1
-    movdqa    [edx + 32], xmm2   // store 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -542,12 +824,12 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
     psrld     xmm3, 27
     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
@@ -556,9 +838,8 @@ __asm {
     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
     pslld     xmm5, 11
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0    // B
     movdqa    xmm2, xmm0    // G
     pslld     xmm0, 8       // R
@@ -572,7 +853,7 @@ __asm {
     por       xmm0, xmm1    // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
@@ -580,13 +861,101 @@ __asm {
   }
 }
 
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  __asm {
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    movd      xmm6, [esp + 12] // dither4
+    mov       ecx, [esp + 16]  // width
+    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    movdqa    xmm7, xmm6
+    punpcklwd xmm6, xmm6
+    punpckhwd xmm7, xmm7
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6    // add dither
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    vbroadcastss xmm6, [esp + 12]  // dither4
+    mov        ecx, [esp + 16]     // width
+    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    vpermq     ymm6, ymm6, 0xd8
+    vpunpcklwd ymm6, ymm6, ymm6
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6    // add dither
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
     psrld     xmm4, 27
     movdqa    xmm5, xmm4       // generate mask 0x000003e0
@@ -596,9 +965,8 @@ __asm {
     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
     pslld     xmm7, 15
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0    // B
     movdqa    xmm2, xmm0    // G
     movdqa    xmm3, xmm0    // R
@@ -623,25 +991,24 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
+    mov       ecx, [esp + 12]  // width
     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
     psllw     xmm4, 12
     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
     pand      xmm0, xmm3    // low nibble
     pand      xmm1, xmm4    // high nibble
-    psrl      xmm0, 4
-    psrl      xmm1, 8
+    psrld     xmm0, 4
+    psrld     xmm1, 8
     por       xmm0, xmm1
     packuswb  xmm0, xmm0
     lea       eax, [eax + 16]
@@ -653,51 +1020,124 @@ __asm {
   }
 }
 
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kARGBToY
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // width
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // width
+    vpcmpeqb   ymm4, ymm4, ymm4
+    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpslld     ymm7, ymm7, 15
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9       // R
+    vpsrld     ymm2, ymm0, 6       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrad     ymm0, ymm0, 16      // A
+    vpand      ymm3, ymm3, ymm6    // R
+    vpand      ymm2, ymm2, ymm5    // G
+    vpand      ymm1, ymm1, ymm4    // B
+    vpand      ymm0, ymm0, ymm7    // A
+    vpor       ymm0, ymm0, ymm1    // BA
+    vpor       ymm2, ymm2, ymm3    // GR
+    vpor       ymm0, ymm0, ymm2    // BGRA
+    vpackssdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpsllw     ymm4, ymm4, 12
+    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4    // high nibble
+    vpand      ymm0, ymm0, ymm3    // low nibble
+    vpsrld     ymm1, ymm1, 8
+    vpsrld     ymm0, ymm0, 4
+    vpor       ymm0, ymm0, ymm1
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
     lea        edx, [edx + 16]
+    sub        ecx, 8
     jg         convertloop
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_ARGBTOARGB4444ROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kARGBToY
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kARGBToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
-    align      16
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -714,29 +1154,30 @@ __asm {
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked)
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kBGRAToY
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kARGBToYJ
+    movdqa     xmm5, xmmword ptr kAddYJ64
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
@@ -744,96 +1185,113 @@ __asm {
     lea        eax, [eax + 64]
     phaddw     xmm0, xmm1
     phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    paddw      xmm2, xmm5
     psrlw      xmm0, 7
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kBGRAToY
+    mov        ecx, [esp + 12]  /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToY
+    vbroadcastf128 ymm5, xmmword ptr kAddY16
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
 
-    align      16
  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
+    vzeroupper
     ret
   }
 }
+#endif  //  HAS_ARGBTOYROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kABGRToY
+    mov        ecx, [esp + 12]  /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
+    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
+
+    vzeroupper
     ret
   }
 }
+#endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kABGRToY
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kBGRAToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
-    align      16
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -850,29 +1308,28 @@ __asm {
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kRGBAToY
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kABGRToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
@@ -884,24 +1341,23 @@ __asm {
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kRGBAToY
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kRGBAToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
-    align      16
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -918,41 +1374,45 @@ __asm {
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -980,10 +1440,10 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -992,37 +1452,37 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+__declspec(naked)
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUVJ128
+    movdqa     xmm6, xmmword ptr kARGBToVJ
+    movdqa     xmm7, xmmword ptr kARGBToUJ
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1044,16 +1504,17 @@ __asm {
     pmaddubsw  xmm3, xmm6
     phaddw     xmm0, xmm2
     phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1062,239 +1523,227 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked)
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kBGRAToU
-    movdqa     xmm6, kBGRAToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm6, xmmword ptr kARGBToV
+    vbroadcastf128 ymm7, xmmword ptr kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88
+    vshufps    ymm0, ymm0, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
     // step 2 - convert to U and V
     // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_ARGBTOUVROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked)
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kBGRAToU
-    movdqa     xmm6, kBGRAToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm6, xmmword ptr kARGBToV
+    vbroadcastf128 ymm7, xmmword ptr kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88
+    vshufps    ymm0, ymm0, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
     // step 2 - convert to U and V
     // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
+    vpaddw     ymm0, ymm0, ymm5
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
+
+    // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_ARGBTOUVJROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-__asm {
-    push       esi
+__declspec(naked)
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kABGRToU
-    movdqa     xmm6, kABGRToV
-    movdqa     xmm5, kAddUV128
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
+    /* convert to U and V */
+    movdqu     xmm0, [eax]          // U
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
     pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
     psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    movdqu     [edx], xmm0
 
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
+    movdqu     xmm0, [eax]          // V
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax,  [eax + 64]
+    movdqu     [edx + edi], xmm0
+    lea        edx,  [edx + 16]
+    sub        ecx,  16
     jg         convertloop
 
     pop        edi
-    pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+__declspec(naked)
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kABGRToU
-    movdqa     xmm6, kABGRToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kBGRAToV
+    movdqa     xmm7, xmmword ptr kBGRAToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1322,10 +1771,10 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1334,33 +1783,37 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+__declspec(naked)
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kRGBAToU
-    movdqa     xmm6, kRGBAToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kABGRToV
+    movdqa     xmm7, xmmword ptr kABGRToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1388,10 +1841,10 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1400,37 +1853,37 @@ __asm {
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+__declspec(naked)
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kRGBAToU
-    movdqa     xmm6, kRGBAToV
-    movdqa     xmm5, kAddUV128
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kRGBAToV
+    movdqa     xmm7, xmmword ptr kRGBAToU
     sub        edi, edx             // stride from u to v
 
-    align      16
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1458,10 +1911,10 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1471,833 +1924,1124 @@ __asm {
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
-#ifdef HAS_I422TOARGBROW_SSSE3
-
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static const vec8 kUVToB = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
+// Read 16 UV from 444
+#define READYUV444_AVX2 __asm {                                                \
+    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpermq     ymm1, ymm1, 0xd8                                          \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
 
-static const vec8 kVUToR = {
-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
 
-static const vec8 kVUToG = {
-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
+#define READYUVA422_AVX2 __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
+    __asm vpermq     ymm5, ymm5, 0xd8                                          \
+    __asm lea        ebp, [ebp + 16]                                           \
+  }
 
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 __asm {                                                \
+    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
+    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
 
-// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
-// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
 
-// Read 8 UV from 411.
-#define READYUV444 __asm {                                                     \
-    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
-    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
   }
 
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
+    __asm lea        eax, [eax + 32]                                           \
   }
 
-// Read 2 UV from 411, upsample to 8 UV.
-#define READYUV411 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
+    __asm lea        eax, [eax + 32]                                           \
   }
 
-// Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
-    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
+    __asm vpsubw     ymm2, ymm3, ymm2                                          \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
+    __asm vpsubw     ymm1, ymm3, ymm1                                          \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
+    __asm vpsubw     ymm0, ymm3, ymm0                                          \
+    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
+    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
+    __asm vpsraw     ymm0, ymm0, 6                                             \
+    __asm vpsraw     ymm1, ymm1, 6                                             \
+    __asm vpsraw     ymm2, ymm2, 6                                             \
+    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
   }
 
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm {                                                       \
-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm movdqa     xmm2, xmm0                                                \
-    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
-    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
-    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
-    __asm psubw      xmm1, kUVBiasG                                            \
-    __asm psubw      xmm2, kUVBiasR                                            \
-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm4                                                \
-    __asm psubsw     xmm3, kYSub16                                             \
-    __asm pmullw     xmm3, kYToRgb                                             \
-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
-    __asm psraw      xmm0, 6                                                   \
-    __asm psraw      xmm1, 6                                                   \
-    __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8                                          \
+    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vmovdqu    0[edx], ymm1                                              \
+    __asm vmovdqu    32[edx], ymm0                                             \
+    __asm lea        edx,  [edx + 64]                                          \
   }
 
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm {                                                       \
-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm movdqa     xmm2, xmm0                                                \
-    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
-    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
-    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
-    __asm psubw      xmm1, kUVBiasG                                            \
-    __asm psubw      xmm2, kUVBiasR                                            \
-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm4                                                \
-    __asm psubsw     xmm3, kYSub16                                             \
-    __asm pmullw     xmm3, kYToRgb                                             \
-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
-    __asm psraw      xmm0, 6                                                   \
-    __asm psraw      xmm1, 6                                                   \
-    __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+// Store 16 RGBA values.
+#define STORERGBA_AVX2 __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
+    __asm vpermq     ymm1, ymm1, 0xd8                                          \
+    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
+    __asm vmovdqu    [edx], ymm0                                               \
+    __asm vmovdqu    [edx + 32], ymm1                                          \
+    __asm lea        edx,  [edx + 64]                                          \
   }
 
-// 8 pixels, dest aligned 16.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* argb_buf,
-                         int width) {
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-    align      16
  convertloop:
-    READYUV444
-    YUVTORGB
+    READYUV422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
+    sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* argb_buf,
-                         int width) {
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
+    sub        ecx, 16
     jg         convertloop
 
+    pop        ebp
+    pop        ebx
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+ convertloop:
+    READYUV444_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-// 8 pixels, dest aligned 16.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* argb_buf,
-                         int width) {
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // abgr
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-    align      16
  convertloop:
-    READYUV411
-    YUVTORGB
+    READYUV411_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
+    sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* argb_buf,
-                         int width) {
+ convertloop:
+    READNV12_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        ebx
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* vu_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-    align      16
  convertloop:
-    READNV12
-    YUVTORGB
+    READNV21_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
+    sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       ebx
+    mov        eax, [esp + 4 + 4]   // yuy2
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* argb_buf,
-                         int width) {
+ convertloop:
+    READYUY2_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        ebx
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
   __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // VU
-    mov        edx, [esp + 4 + 12]  // argb
+    push       ebx
+    mov        eax, [esp + 4 + 4]   // uyvy
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
-    align      16
  convertloop:
-    READNV12
-    YVUTORGB
+    READUYVY_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
+    sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // abgr
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STORERGBA_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        ebx
+    pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
 
-// 8 pixels, unaligned.
+// Read 8 UV from 444.
+#define READYUV444 __asm {                                                     \
+    __asm movq       xmm0, qword ptr [esi] /* U */                             \
+    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm {                                                     \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422 __asm {                                                    \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
+    __asm lea        ebp, [ebp + 8]                                            \
+  }
+
+// Read 2 UV from 411, upsample to 8 UV.
+// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
+//  __asm pinsrw     xmm0, [esi], 0        /* U */
+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
+#define READYUV411_EBX __asm {                                                 \
+    __asm movzx      ebx, word ptr [esi]        /* U */                        \
+    __asm movd       xmm0, ebx                                                 \
+    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
+    __asm movd       xmm1, ebx                                                 \
+    __asm lea        esi,  [esi + 2]                                           \
+    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
+    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
+    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
+
+// Read 4 VU from NV21, upsample to 8 UV.
+#define READNV21 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+#define READYUY2 __asm {                                                       \
+    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
+    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+#define READUYVY __asm {                                                       \
+    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
+    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(YuvConstants) __asm {                                         \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm movdqa     xmm3, xmm0                                                \
+    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
+    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
+    __asm psubw      xmm0, xmm1                                                \
+    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
+    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
+    __asm psubw      xmm1, xmm2                                                \
+    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
+    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
+    __asm psubw      xmm2, xmm3                                                \
+    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
+    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+
+// Store 8 ARGB values.
+#define STOREARGB __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm0                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 BGRA values.
+#define STOREBGRA __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm {                                                     \
+    /* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]                                          \
+  }
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm {                                                    \
+    /* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* RRGB -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0    /* G */                                     \
+    __asm pslld      xmm0, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm0, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm0, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm0, xmm3    /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1    /* G */                                     \
+    __asm pslld      xmm1, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm1, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm1, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm packssdw   xmm0, xmm1                                                \
+    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]                                           \
+  }
+
+// 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* argb_buf,
-                                   int width) {
+__declspec(naked)
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
 
-    align      16
  convertloop:
     READYUV444
-    YUVTORGB
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
 
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* argb_buf,
-                                   int width) {
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
-    align      16
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(ebx)
+    STORERGB24
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
 
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* argb_buf,
-                                   int width) {
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb565_buf,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    psrld      xmm5, 27
+    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    psrld      xmm6, 26
+    pslld      xmm6, 5
+    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pslld      xmm7, 11
 
-    align      16
  convertloop:
-    READYUV411
-    YUVTORGB
+    READYUV422
+    YUVTORGB(ebx)
+    STORERGB565
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
 
-
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* argb_buf,
-                                   int width) {
+__declspec(naked)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      16
  convertloop:
-    READNV12
-    YUVTORGB
+    READYUV422
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
+    pop        edi
     pop        esi
     ret
   }
 }
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* argb_buf,
-                                   int width) {
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // VU
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    push       edi
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
+    sub        edi, esi
 
-    align      16
  convertloop:
-    READNV12
-    YVUTORGB
+    READYUVA422
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebp
+    pop        ebx
+    pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_SSSE3(const uint8* y_buf,
+// 8 pixels.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked)
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
-                         uint8* bgra_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // bgra
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        edx, [esp + 16 + 16]  // abgr
+    mov        ebp, [esp + 16 + 20]  // yuvconstants
+    mov        ecx, [esp + 16 + 24]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READYUV411_EBX
+    YUVTORGB(ebp)
+    STOREARGB
 
-    // Step 3: Weave into BGRA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebp
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* bgra_buf,
-                                   int width) {
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   __asm {
     push       esi
-    push       edi
+    push       ebx
     mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // bgra
+    mov        esi, [esp + 8 + 8]   // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READNV12
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into BGRA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqu     [edx], xmm5
-    movdqu     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
+    pop        ebx
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* abgr_buf,
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* vu_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
   __asm {
     push       esi
-    push       edi
+    push       ebx
     mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // abgr
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READNV21
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm0, xmm5           // BA
-    movdqa     xmm1, xmm2
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
+    pop        ebx
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* abgr_buf,
-                                   int width) {
+// 8 pixels.
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // abgr
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
+    push       ebx
+    mov        eax, [esp + 4 + 4]   // yuy2
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READYUY2
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm0, xmm5           // BA
-    movdqa     xmm1, xmm2
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqu     [edx], xmm2
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
-    pop        esi
+    pop        ebx
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgba_buf,
+// 8 pixels.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
   __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgba
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
+    push       ebx
+    mov        eax, [esp + 4 + 4]   // uyvy
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 
-    align      16
  convertloop:
-    READYUV422
-    YUVTORGB
+    READUYVY
+    YUVTORGB(ebx)
+    STOREARGB
 
-    // Step 3: Weave into RGBA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm2           // GR
-    punpcklbw  xmm5, xmm0           // AB
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
-    pop        esi
+    pop        ebx
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgba_buf,
-                                   int width) {
+__declspec(naked)
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_rgba,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgba
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
 
-    align      16
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(ebx)
+    STORERGBA
 
-    // Step 3: Weave into RGBA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm2           // GR
-    punpcklbw  xmm5, xmm0           // AB
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
-    movdqu     [edx], xmm5
-    movdqu     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
-
 #endif  // HAS_I422TOARGBROW_SSSE3
 
-#ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) {
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
   __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2,0
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
     pslld      xmm4, 24
-    mov        eax,0x10001000
-    movd       xmm3,eax
-    pshufd     xmm3,xmm3,0
-    mov        eax,0x012a012a
-    movd       xmm2,eax
-    pshufd     xmm2,xmm2,0
+
     mov        eax, [esp + 4]       // Y
     mov        edx, [esp + 8]       // rgb
     mov        ecx, [esp + 12]      // width
 
-    align      16
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm0           // Y.Y
-    psubusw    xmm0, xmm3
     pmulhuw    xmm0, xmm2
+    psubusw    xmm0, xmm3
+    psrlw      xmm0, 6
     packuswb   xmm0, xmm0           // G
 
     // Step 2: Weave into ARGB
@@ -2307,84 +3051,125 @@ void YToARGBRow_SSE2(const uint8* y_buf,
     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd      xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd      xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpslld     ymm4, ymm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
 
+ convertloop:
+    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu    xmm0, [eax]
+    lea        eax, [eax + 16]
+    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpmulhuw   ymm0, ymm0, ymm2
+    vpsubusw   ymm0, ymm0, ymm3
+    vpsrlw     ymm0, ymm0, 6
+    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpor       ymm0, ymm0, ymm4
+    vpor       ymm1, ymm1, ymm4
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
     ret
   }
 }
-#endif  // HAS_YTOARGBROW_SSE2
+#endif  // HAS_I400TOARGBROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSSE3
-
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
+  __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, kShuffleMirror
-    lea       eax, [eax - 16]
+    movdqa    xmm5, xmmword ptr kShuffleMirror
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax + ecx]
+    movdqu    xmm0, [eax - 16 + ecx]
     pshufb    xmm0, xmm5
-    sub       ecx, 16
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     lea       edx, [edx + 16]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
-#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
-__declspec(naked) __declspec(align(16))
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-__asm {
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    lea       eax, [eax - 16]
+    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
 
-    align      16
  convertloop:
-    movdqu    xmm0, [eax + ecx]
-    movdqa    xmm1, xmm0        // swap bytes
-    psllw     xmm0, 8
-    psrlw     xmm1, 8
-    por       xmm0, xmm1
-    pshuflw   xmm0, xmm0, 0x1b  // swap words
-    pshufhw   xmm0, xmm0, 0x1b
-    pshufd    xmm0, xmm0, 0x4e  // swap qwords
-    sub       ecx, 16
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
+    vmovdqu   ymm0, [eax - 32 + ecx]
+    vpshufb   ymm0, ymm0, ymm5
+    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 32
     jg        convertloop
+    vzeroupper
     ret
   }
 }
-#endif  // HAS_MIRRORROW_SSE2
+#endif  // HAS_MIRRORROW_AVX2
 
-#ifdef HAS_MIRRORROW_UV_SSSE3
+#ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorUV = {
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked) __declspec(align(16))
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+__declspec(naked)
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
     push      edi
@@ -2392,73 +3177,91 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
     mov       edx, [esp + 4 + 8]   // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm1, kShuffleMirrorUV
+    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
     lea       eax, [eax + ecx * 2 - 16]
     sub       edi, edx
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax]
+    movdqu    xmm0, [eax]
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm1
-    sub       ecx, 8
     movlpd    qword ptr [edx], xmm0
     movhpd    qword ptr [edx + edi], xmm0
     lea       edx, [edx + 8]
+    sub       ecx, 8
     jg        convertloop
 
     pop       edi
     ret
   }
 }
-#endif  // HAS_MIRRORROW_UV_SSSE3
+#endif  // HAS_MIRRORUVROW_SSSE3
 
-#ifdef HAS_ARGBMIRRORROW_SSSE3
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 
+ convertloop:
+    movdqu    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufd    xmm0, xmm0, 0x1b
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
+__declspec(naked)
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, kARGBShuffleMirror
-    lea       eax, [eax - 16]
+    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
 
-    align      16
  convertloop:
-    movdqa    xmm0, [eax + ecx * 4]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [edx], xmm0
-    lea       edx, [edx + 16]
+    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 8
     jg        convertloop
+    vzeroupper
     ret
   }
 }
-#endif  // HAS_ARGBMIRRORROW_SSSE3
+#endif  // HAS_ARGBMIRRORROW_AVX2
 
-#ifdef HAS_SPLITUV_SSE2
-__declspec(naked) __declspec(align(16))
-void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked)
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
+    mov        ecx, [esp + 4 + 16]   // width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     movdqa     xmm3, xmm1
@@ -2468,8 +3271,8 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     psrlw      xmm2, 8      // odd bytes
     psrlw      xmm3, 8
     packuswb   xmm2, xmm3
-    movdqa     [edx], xmm0
-    movdqa     [edx + edi], xmm2
+    movdqu     [edx], xmm0
+    movdqu     [edx + edi], xmm2
     lea        edx, [edx + 16]
     sub        ecx, 16
     jg         convertloop
@@ -2478,58 +3281,363 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     ret
   }
 }
-#endif  // HAS_SPLITUV_SSE2
+
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked)
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked)
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+  convertloop:
+    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm1, [eax + edx]  // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1       // first 8 UV pairs
+    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+  convertloop:
+    vmovdqu    ymm0, [eax]           // read 32 U's
+    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    lea        eax,  [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
+    lea        edi, [edi + 64]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
-    sub        edx, eax
+    test       eax, 15
+    jne        convertloopu
+    test       edx, 15
+    jne        convertloopu
 
-    align      16
-  convertloop:
+  convertloopa:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
-    movdqa     [eax + edx], xmm0
-    movdqa     [eax + edx + 16], xmm1
     lea        eax, [eax + 32]
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
     sub        ecx, 32
-    jg         convertloop
+    jg         convertloopa
+    ret
+
+  convertloopu:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloopu
     ret
   }
 }
 #endif  // HAS_COPYROW_SSE2
 
-#ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+#ifdef HAS_COPYROW_AVX
+// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 64
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
+// Multiple of 1.
+__declspec(naked)
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
     mov        esi, [esp + 4]   // src
     mov        edi, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
-    shr        ecx, 2
-    rep movsd
+    rep movsb
     mov        edi, edx
     mov        esi, eax
     ret
   }
 }
-#endif  // HAS_COPYROW_X86
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movdqu     xmm2, [eax]
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + 32]
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_a
+    mov        ecx, [esp + 12]  // width
+
+  extractloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm0, 24
+    psrld      xmm1, 24
+    packssdw   xmm0, xmm1
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8
+    jg         extractloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movq       xmm2, qword ptr [eax]  // 8 Y's
+    lea        eax, [eax + 8]
+    punpcklbw  xmm2, xmm2
+    punpckhwd  xmm3, xmm2
+    punpcklwd  xmm2, xmm2
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vpmovzxbd  ymm1, qword ptr [eax]
+    vpmovzxbd  ymm2, qword ptr [eax + 8]
+    lea        eax, [eax + 16]
+    vpslld     ymm1, ymm1, 24
+    vpslld     ymm2, ymm2, 24
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be multiple of 4.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
   __asm {
+    movzx      eax, byte ptr [esp + 8]    // v8
+    mov        edx, 0x01010101  // Duplicate byte to all bytes.
+    mul        edx              // overwrites edx with upper part of result.
     mov        edx, edi
     mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
     rep stosd
@@ -2538,68 +3646,65 @@ void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   }
 }
 
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                   int dst_stride, int height) {
+// Write 'count' bytes using an 8 bit value repeated.
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   __asm {
-    push       esi
-    push       edi
-    push       ebp
-    mov        edi, [esp + 12 + 4]   // dst
-    mov        eax, [esp + 12 + 8]   // v32
-    mov        ebp, [esp + 12 + 12]  // width
-    mov        edx, [esp + 12 + 16]  // dst_stride
-    mov        esi, [esp + 12 + 20]  // height
-    lea        ecx, [ebp * 4]
-    sub        edx, ecx             // stride - width * 4
-
-    align      16
-  convertloop:
-    mov        ecx, ebp
-    rep stosd
-    add        edi, edx
-    sub        esi, 1
-    jg         convertloop
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v8
+    mov        ecx, [esp + 12]  // count
+    rep stosb
+    mov        edi, edx
+    ret
+  }
+}
 
-    pop        ebp
-    pop        edi
-    pop        esi
+// Write 'count' 32 bit values.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    rep stosd
+    mov        edi, edx
     ret
   }
 }
 #endif  // HAS_SETROW_X86
 
-#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int pix) {
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked)
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 12]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
+    vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
@@ -2607,106 +3712,107 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 8 + 20]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix) {
+__declspec(naked)
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+                     uint8* dst_y, int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
+    mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 12]   // width
 
-    align      16
   convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
+    vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
@@ -2714,104 +3820,109 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 8 + 20]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
+#endif  // HAS_YUY2TOYROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int pix) {
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked)
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+                     uint8* dst_y, int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
+    mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
+    mov        ecx, [esp + 12]   // width
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
-    psrlw      xmm1, 8
+    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
@@ -2819,22 +3930,21 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
+    mov        ecx, [esp + 8 + 20]   // width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
@@ -2853,26 +3963,25 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
+    mov        ecx, [esp + 4 + 16]   // width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
@@ -2890,15 +3999,14 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix) {
+__declspec(naked)
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+                     uint8* dst_y, int width) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
+    mov        ecx, [esp + 12]   // width
 
-    align      16
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -2906,17 +4014,17 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
     psrlw      xmm0, 8    // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
@@ -2924,12 +4032,11 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
+    mov        ecx, [esp + 8 + 20]   // width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -2958,20 +4065,19 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
+    mov        ecx, [esp + 4 + 16]   // width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      16
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -2996,127 +4102,122 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
-#ifdef HAS_ARGBBLENDROW_SSE2
+#ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
-    psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
-    psrlw      xmm6, 8
+    push       edi
     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
-
-    sub        ecx, 1
-    je         convertloop1     // only 1 pixel?
-    jl         convertloop1b
-
-    // 1 pixel loop until destination pointer is aligned.
-  alignloop1:
-    test       edx, 15          // aligned?
-    je         alignloop1b
-    movd       xmm3, [eax]
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jge        alignloop1
-
-  alignloop1b:
-    add        ecx, 1 - 4
-    jl         convertloop4b
-
-    // 4 pixel loop.
-  convertloop4:
-    movdqu     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jge        convertloop4
-
-  convertloop4b:
-    add        ecx, 4 - 1
-    jl         convertloop1b
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0x00
+
+    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0x00
+    mov        eax, [esp + 8 + 4]   // src0
+    mov        edx, [esp + 8 + 8]   // src1
+    mov        esi, [esp + 8 + 12]  // alpha
+    mov        edi, [esp + 8 + 16]  // dst
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        eax, esi
+    sub        edx, esi
+    sub        edi, esi
 
-    // 1 pixel loop.
-  convertloop1:
-    movd       xmm3, [eax]      // src argb
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jge        convertloop1
+    // 8 pixel loop.
+  convertloop8:
+    movq       xmm0, qword ptr [esi]        // alpha
+    punpcklbw  xmm0, xmm0
+    pxor       xmm0, xmm5         // a, 255-a
+    movq       xmm1, qword ptr [eax + esi]  // src0
+    movq       xmm2, qword ptr [edx + esi]  // src1
+    punpcklbw  xmm1, xmm2
+    psubb      xmm1, xmm6         // bias src0/1 - 128
+    pmaddubsw  xmm0, xmm1
+    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi + esi], xmm0
+    lea        esi, [esi + 8]
+    sub        ecx, 8
+    jg         convertloop8
 
-  convertloop1b:
+    pop        edi
     pop        esi
     ret
   }
 }
-#endif  // HAS_ARGBBLENDROW_SSE2
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  __asm {
+    push        esi
+    push        edi
+    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
+    vpsllw      ymm5, ymm5, 8
+    mov         eax, 0x80808080  // 128 for biasing image to signed.
+    vmovd       xmm6, eax
+    vbroadcastss ymm6, xmm6
+    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    vmovd       xmm7, eax
+    vbroadcastss ymm7, xmm7
+    mov         eax, [esp + 8 + 4]   // src0
+    mov         edx, [esp + 8 + 8]   // src1
+    mov         esi, [esp + 8 + 12]  // alpha
+    mov         edi, [esp + 8 + 16]  // dst
+    mov         ecx, [esp + 8 + 20]  // width
+    sub         eax, esi
+    sub         edx, esi
+    sub         edi, esi
+
+    // 32 pixel loop.
+  convertloop32:
+    vmovdqu     ymm0, [esi]        // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5   // a, 255-a
+    vpxor       ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu     ymm1, [eax + esi]  // src0
+    vmovdqu     ymm2, [edx + esi]  // src1
+    vpunpckhbw  ymm4, ymm1, ymm2
+    vpunpcklbw  ymm1, ymm1, ymm2
+    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpmaddubsw  ymm0, ymm0, ymm1
+    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpsrlw      ymm3, ymm3, 8
+    vpsrlw      ymm0, ymm0, 8
+    vpackuswb   ymm0, ymm0, ymm3
+    vmovdqu     [edi + esi], ymm0
+    lea         esi, [esi + 32]
+    sub         ecx, 32
+    jg          convertloop32
+
+    pop         edi
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
@@ -3124,15 +4225,9 @@ static const uvec8 kShuffleAlpha = {
   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
 };
-// Same as SSE2, but replaces:
-//    psrlw      xmm3, 8          // alpha
-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-//    pshuflw    xmm3, xmm3,0F5h
-// with..
-//    pshufb     xmm3, kShuffleAlpha // alpha
-// Blend 8 pixels at a time.
 
-__declspec(naked) __declspec(align(16))
+// Blend 8 pixels at a time.
+__declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   __asm {
@@ -3141,7 +4236,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
     psrlw      xmm7, 15
     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
     psrlw      xmm6, 8
@@ -3149,81 +4244,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     psllw      xmm5, 8
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
-
-    sub        ecx, 1
-    je         convertloop1     // only 1 pixel?
-    jl         convertloop1b
-
-    // 1 pixel loop until destination pointer is aligned.
-  alignloop1:
-    test       edx, 15          // aligned?
-    je         alignloop1b
-    movd       xmm3, [eax]
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jge        alignloop1
-
-  alignloop1b:
-    add        ecx, 1 - 4
-    jl         convertloop4b
-
-    test       eax, 15          // unaligned?
-    jne        convertuloop4
-    test       esi, 15          // unaligned?
-    jne        convertuloop4
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
 
     // 4 pixel loop.
   convertloop4:
-    movdqa     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqa     xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqa     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jge        convertloop4
-    jmp        convertloop4b
-
-    // 4 pixel unaligned loop.
-  convertuloop4:
     movdqu     xmm3, [eax]      // src argb
     lea        eax, [eax + 16]
     movdqa     xmm0, xmm3       // src argb
     pxor       xmm3, xmm4       // ~alpha
     movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
+    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
     pand       xmm2, xmm6       // _r_b
     paddw      xmm3, xmm7       // 256 - alpha
     pmullw     xmm2, xmm3       // _r_b * alpha
@@ -3236,10 +4267,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
-    jge        convertuloop4
+    sub        ecx, 4
+    jge        convertloop4
 
   convertloop4b:
     add        ecx, 4 - 1
@@ -3252,7 +4283,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     movdqa     xmm0, xmm3       // src argb
     pxor       xmm3, xmm4       // ~alpha
     movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
+    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
     pand       xmm2, xmm6       // _r_b
     paddw      xmm3, xmm7       // 256 - alpha
     pmullw     xmm2, xmm3       // _r_b * alpha
@@ -3265,9 +4296,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        convertloop1
 
   convertloop1b:
@@ -3277,50 +4308,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
-#ifdef HAS_ARGBATTENUATE_SSE2
-// Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
-    psrld      xmm5, 8
-
-    align      16
- convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
-    punpcklbw  xmm0, xmm0       // first 2
-    pshufhw    xmm2, xmm0,0FFh  // 8 alpha words
-    pshuflw    xmm2, xmm2,0FFh
-    pmulhuw    xmm0, xmm2       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
-    punpckhbw  xmm1, xmm1       // next 2 pixels
-    pshufhw    xmm2, xmm1,0FFh  // 8 alpha words
-    pshuflw    xmm2, xmm2,0FFh
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // alphas
-    psrlw      xmm0, 8
-    pand       xmm2, xmm4
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    pand       xmm0, xmm5       // keep original alphas
-    por        xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBATTENUATE_SSE2
-
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
@@ -3330,39 +4317,38 @@ static const uvec8 kShuffleAlpha1 = {
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
-    sub        edx, eax
     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
     pslld      xmm3, 24
-    movdqa     xmm4, kShuffleAlpha0
-    movdqa     xmm5, kShuffleAlpha1
+    movdqa     xmm4, xmmword ptr kShuffleAlpha0
+    movdqa     xmm5, xmmword ptr kShuffleAlpha1
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
     pmulhuw    xmm0, xmm1       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqa     xmm2, [eax]      // read 4 pixels
+    movdqu     xmm2, [eax]      // read 4 pixels
     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
     pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // mask original alpha
+    movdqu     xmm2, [eax]      // mask original alpha
+    lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
     jg         convertloop
 
     ret
@@ -3370,88 +4356,229 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+__declspec(naked)
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpslld     ymm5, ymm5, 24
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
+    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
+    vpand      ymm6, ymm6, ymm5  // isolate alpha
+    vpsrlw     ymm0, ymm0, 8
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpor       ymm0, ymm0, ymm6  // copy original alpha
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
+    push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb0
-    mov        edx, [esp + 8 + 8]   // dst_argb
-    mov        ecx, [esp + 8 + 12]  // width
-    sub        edx, eax
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
+    mov        eax, [esp + 12 + 4]   // src_argb
+    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    lea        ebx, fixed_invtbl8
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
     punpcklbw  xmm0, xmm0       // first 2
-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
     movlhps    xmm2, xmm3
     pmulhuw    xmm0, xmm2       // rgb * a
 
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // forth alpha
     punpckhbw  xmm1, xmm1       // next 2
-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
     movlhps    xmm2, xmm3
     pmulhuw    xmm1, xmm2       // rgb * a
-
-    movdqa     xmm2, [eax]      // alphas
-    pand       xmm2, xmm4
+    lea        eax, [eax + 16]
     packuswb   xmm0, xmm1
-    por        xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
     jg         convertloop
+
     pop        edi
     pop        esi
+    pop        ebx
     ret
   }
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
 
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
-static const vec8 kARGBToGray = {
-  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
 };
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
+    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#else  // USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
 
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // src_argb
+    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    sub        edx, eax
+    lea        ebx, fixed_invtbl8
+    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+    // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]                 // alpha0
+    movzx      edi, byte ptr [eax + 7]                 // alpha1
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
+    movzx      esi, byte ptr [eax + 11]                // alpha2
+    movzx      edi, byte ptr [eax + 15]                // alpha3
+    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
+    movzx      esi, byte ptr [eax + 19]                // alpha4
+    movzx      edi, byte ptr [eax + 23]                // alpha5
+    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
+    movzx      esi, byte ptr [eax + 27]                // alpha6
+    movzx      edi, byte ptr [eax + 31]                // alpha7
+    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
+    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    // end of VPGATHER
+
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    vzeroupper
+    ret
+  }
+}
+#endif  // USE_GATHER
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, kARGBToGray
-    sub        edx, eax
+    movdqa     xmm4, xmmword ptr kARGBToYJ
+    movdqa     xmm5, xmmword ptr kAddYJ64
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]  // G
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0   // 8 G bytes
-    movdqa     xmm2, [eax]  // A
-    movdqa     xmm3, [eax + 16]
+    movdqu     xmm2, [eax]  // A
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
     psrld      xmm2, 24
     psrld      xmm3, 24
     packuswb   xmm2, xmm3
@@ -3462,10 +4589,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     movdqa     xmm1, xmm0
     punpcklwd  xmm0, xmm3   // GGGA first 4
     punpckhwd  xmm1, xmm3   // GGGA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
     sub        ecx, 8
-    movdqa     [eax + edx], xmm0
-    movdqa     [eax + edx + 16], xmm1
-    lea        eax, [eax + 32]
     jg         convertloop
     ret
   }
@@ -3490,41 +4617,40 @@ static const vec8 kARGBToSepiaR = {
 };
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* dst_argb */
     mov        ecx, [esp + 8]   /* width */
-    movdqa     xmm2, kARGBToSepiaB
-    movdqa     xmm3, kARGBToSepiaG
-    movdqa     xmm4, kARGBToSepiaR
+    movdqa     xmm2, xmmword ptr kARGBToSepiaB
+    movdqa     xmm3, xmmword ptr kARGBToSepiaG
+    movdqa     xmm4, xmmword ptr kARGBToSepiaR
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm6, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm6, [eax + 16]
     pmaddubsw  xmm0, xmm2
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0   // 8 B values
-    movdqa     xmm5, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // G
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
     packuswb   xmm5, xmm5   // 8 G values
     punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqa     xmm5, [eax]  // R
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // R
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
     packuswb   xmm5, xmm5   // 8 R values
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
@@ -3533,10 +4659,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
     movdqa     xmm1, xmm0   // Weave BG, RA together
     punpcklwd  xmm0, xmm5   // BGRA first 4
     punpckhwd  xmm1, xmm5   // BGRA next 4
-    sub        ecx, 8
-    movdqa     [eax], xmm0
-    movdqa     [eax + 16], xmm1
+    movdqu     [eax], xmm0
+    movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
+    sub        ecx, 8
     jg         convertloop
     ret
   }
@@ -3548,116 +4674,68 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width) {
+__declspec(naked)
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        edx, [esp + 8]   /* matrix_argb */
-    mov        ecx, [esp + 12]  /* width */
-    movd       xmm2, [edx]
-    movd       xmm3, [edx + 4]
-    movd       xmm4, [edx + 8]
-    pshufd     xmm2, xmm2, 0
-    pshufd     xmm3, xmm3, 0
-    pshufd     xmm4, xmm4, 0
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* matrix_argb */
+    movdqu     xmm5, [ecx]
+    pshufd     xmm2, xmm5, 0x00
+    pshufd     xmm3, xmm5, 0x55
+    pshufd     xmm4, xmm5, 0xaa
+    pshufd     xmm5, xmm5, 0xff
+    mov        ecx, [esp + 16]  /* width */
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm6, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm0, xmm2
-    pmaddubsw  xmm6, xmm2
-    movdqa     xmm5, [eax]  // G
-    movdqa     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm7, xmm2
+    movdqu     xmm6, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm6   // B
-    phaddsw    xmm5, xmm1   // G
-    psraw      xmm0, 7      // B
-    psraw      xmm5, 7      // G
+    phaddsw    xmm0, xmm7   // B
+    phaddsw    xmm6, xmm1   // G
+    psraw      xmm0, 6      // B
+    psraw      xmm6, 6      // G
     packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqa     xmm5, [eax]  // R
-    movdqa     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm4
+    packuswb   xmm6, xmm6   // 8 G values
+    punpcklbw  xmm0, xmm6   // 8 BG values
+    movdqu     xmm1, [eax]  // R
+    movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
-    phaddsw    xmm5, xmm1
-    psraw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm1, [eax + 16]
-    psrld      xmm6, 24
-    psrld      xmm1, 24
-    packuswb   xmm6, xmm1
+    pmaddubsw  xmm7, xmm4
+    phaddsw    xmm1, xmm7   // R
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm6, xmm5
+    pmaddubsw  xmm7, xmm5
+    phaddsw    xmm6, xmm7   // A
+    psraw      xmm1, 6      // R
+    psraw      xmm6, 6      // A
+    packuswb   xmm1, xmm1   // 8 R values
     packuswb   xmm6, xmm6   // 8 A values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
-    sub        ecx, 8
-    movdqa     [eax], xmm0
-    movdqa     [eax + 16], xmm1
+    punpcklbw  xmm1, xmm6   // 8 RA values
+    movdqa     xmm6, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm1   // BGRA first 4
+    punpckhwd  xmm6, xmm1   // BGRA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
+    lea        edx, [edx + 32]
+    sub        ecx, 8
     jg         convertloop
     ret
   }
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    push       ebp
-    mov        eax, [esp + 16 + 4]   /* dst_argb */
-    mov        edi, [esp + 16 + 8]   /* table_argb */
-    mov        ecx, [esp + 16 + 12]  /* width */
-    xor        ebx, ebx
-    xor        edx, edx
-
-    align      16
- convertloop:
-    mov        ebp, dword ptr [eax]  // BGRA
-    mov        esi, ebp
-    and        ebp, 255
-    shr        esi, 8
-    and        esi, 255
-    mov        bl, [edi + ebp * 4 + 0]  // B
-    mov        dl, [edi + esi * 4 + 1]  // G
-    mov        ebp, dword ptr [eax]  // BGRA
-    mov        esi, ebp
-    shr        ebp, 16
-    shr        esi, 24
-    and        ebp, 255
-    mov        [eax], bl
-    mov        [eax + 1], dl
-    mov        bl, [edi + ebp * 4 + 2]  // R
-    mov        dl, [edi + esi * 4 + 3]  // A
-    mov        [eax + 2], bl
-    mov        [eax + 3], dl
-    lea        eax, [eax + 4]
-    sub        ecx, 1
-    jg         convertloop
-    pop        ebp
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-#endif  // HAS_ARGBCOLORTABLEROW_X86
-
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   __asm {
@@ -3676,32 +4754,514 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
     pslld      xmm6, 24
 
-    align      16
  convertloop:
-    movdqa     xmm0, [eax]  // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     punpcklbw  xmm0, xmm5   // first 2 pixels
     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
-    movdqa     xmm1, [eax]  // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     punpckhbw  xmm1, xmm5   // next 2 pixels
     pmulhuw    xmm1, xmm2
     pmullw     xmm0, xmm3   // * interval_size
-    movdqa     xmm7, [eax]  // read 4 pixels
+    movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
     pand       xmm7, xmm6   // mask alpha
     paddw      xmm0, xmm4   // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
-    sub        ecx, 4
-    movdqa     [eax], xmm0
+    movdqu     [eax], xmm0
     lea        eax, [eax + 16]
+    sub        ecx, 4
     jg         convertloop
     ret
   }
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
 
-#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+__declspec(naked)
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    movd       xmm2, [esp + 16]  // value
+    punpcklbw  xmm2, xmm2
+    punpcklqdq xmm2, xmm2
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0       // first 2
+    punpckhbw  xmm1, xmm1       // next 2
+    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm1, xmm2       // argb * value
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, xmm0
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked)
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    sub        ecx, 4
+    jl         convertloop49
+
+ convertloop4:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4
+
+ convertloop49:
+    add        ecx, 4 - 1
+    jl         convertloop19
+
+ convertloop1:
+    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    lea        esi, [esi + 4]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+ convertloop19:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+__declspec(naked)
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_y0
+    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        edi, [esp + 8 + 12]  // src_y2
+    mov        edx, [esp + 8 + 16]  // dst_sobelx
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        esi, eax
+    sub        edi, eax
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2
+    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm1
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+__declspec(naked)
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_y0
+    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        edx, [esp + 4 + 12]  // dst_sobely
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1
+    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2
+    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm1
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked)
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    pcmpeqb    xmm5, xmm5           // alpha 255
+    pslld      xmm5, 24             // 0xff000000
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0             // GG
+    punpcklbw  xmm2, xmm0             // First 8
+    punpckhbw  xmm0, xmm0             // Next 8
+    movdqa     xmm1, xmm2             // GGGG
+    punpcklwd  xmm1, xmm2             // First 4
+    punpckhwd  xmm2, xmm2             // Next 4
+    por        xmm1, xmm5             // GGGA
+    por        xmm2, xmm5
+    movdqa     xmm3, xmm0             // GGGG
+    punpcklwd  xmm3, xmm0             // Next 4
+    punpckhwd  xmm0, xmm0             // Last 4
+    por        xmm3, xmm5             // GGGA
+    por        xmm0, xmm5
+    movdqu     [edx], xmm1
+    movdqu     [edx + 16], xmm2
+    movdqu     [edx + 32], xmm3
+    movdqu     [edx + 48], xmm0
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked)
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked)
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    pcmpeqb    xmm5, xmm5           // alpha 255
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0             // XA
+    punpcklbw  xmm3, xmm5
+    punpckhbw  xmm0, xmm5
+    movdqa     xmm4, xmm1             // YS
+    punpcklbw  xmm4, xmm2
+    punpckhbw  xmm1, xmm2
+    movdqa     xmm6, xmm4             // YSXA
+    punpcklwd  xmm6, xmm3             // First 4
+    punpckhwd  xmm4, xmm3             // Next 4
+    movdqa     xmm7, xmm1             // YSXA
+    punpcklwd  xmm7, xmm0             // Next 4
+    punpckhwd  xmm1, xmm0             // Last 4
+    movdqu     [edx], xmm6
+    movdqu     [edx + 16], xmm4
+    movdqu     [edx + 32], xmm7
+    movdqu     [edx + 48], xmm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 // Consider float CumulativeSum.
 // Consider calling CumulativeSum one row at time as needed.
 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
@@ -3713,31 +5273,85 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 // area is the number of pixels in the area being averaged.
 // dst points to pixel to store result to.
 // count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
-void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count) {
+// Does 4 pixels at a time.
+// This function requires alignment on accumulation buffer pointers.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
   __asm {
     mov        eax, topleft  // eax topleft
     mov        esi, botleft  // esi botleft
     mov        edx, width
-    movd       xmm4, area
+    movd       xmm5, area
     mov        edi, dst
     mov        ecx, count
-    cvtdq2ps   xmm4, xmm4
-    rcpss      xmm4, xmm4  // 1.0f / area
+    cvtdq2ps   xmm5, xmm5
+    rcpss      xmm4, xmm5  // 1.0f / area
     pshufd     xmm4, xmm4, 0
     sub        ecx, 4
     jl         l4b
 
+    cmp        area, 128  // 128 pixels will not overflow 15 bits.
+    ja         l4
+
+    pshufd     xmm5, xmm5, 0        // area
+    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    psrld      xmm6, 16
+    cvtdq2ps   xmm6, xmm6
+    addps      xmm5, xmm6           // (65536.0 + area - 1)
+    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
+    packssdw   xmm5, xmm5           // 16 bit shorts
+
+    // 4 pixel loop small blocks.
+  s4:
+    // top left
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
+    packssdw   xmm2, xmm3
+
+    pmulhuw    xmm0, xmm5
+    pmulhuw    xmm2, xmm5
+
+    packuswb   xmm0, xmm2
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        s4
+
+    jmp        l4b
+
     // 4 pixel loop
-    align      4
   l4:
     // top left
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
 
     // - top right
     psubd      xmm0, [eax + edx * 4]
@@ -3784,9 +5398,8 @@ void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
     lea        eax, [eax + 16]
     psubd      xmm0, [esi]
@@ -3804,7 +5417,7 @@ void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
   l1b:
   }
 }
-#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
@@ -3816,7 +5429,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     mov        edx, cumsum
     mov        esi, previous_cumsum
     mov        ecx, width
-    sub        esi, edx
     pxor       xmm0, xmm0
     pxor       xmm1, xmm1
 
@@ -3826,7 +5438,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     jne        l4b
 
     // 4 pixel loop
-    align      4
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -3843,25 +5454,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     punpckhwd  xmm5, xmm1
 
     paddd      xmm0, xmm2
-    movdqa     xmm2, [edx + esi]  // previous row above.
+    movdqu     xmm2, [esi]  // previous row above.
     paddd      xmm2, xmm0
 
     paddd      xmm0, xmm3
-    movdqa     xmm3, [edx + esi + 16]
+    movdqu     xmm3, [esi + 16]
     paddd      xmm3, xmm0
 
     paddd      xmm0, xmm4
-    movdqa     xmm4, [edx + esi + 32]
+    movdqu     xmm4, [esi + 32]
     paddd      xmm4, xmm0
 
     paddd      xmm0, xmm5
-    movdqa     xmm5, [edx + esi + 48]
+    movdqu     xmm5, [esi + 48]
+    lea        esi, [esi + 64]
     paddd      xmm5, xmm0
 
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
-    movdqa     [edx + 32], xmm4
-    movdqa     [edx + 48], xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    movdqu     [edx + 32], xmm4
+    movdqu     [edx + 48], xmm5
 
     lea        edx, [edx + 64]
     sub        ecx, 4
@@ -3872,14 +5484,14 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
     lea        eax, [eax + 4]
     punpcklbw  xmm2, xmm1
     punpcklwd  xmm2, xmm1
     paddd      xmm0, xmm2
-    movdqu     xmm2, [edx + esi]
+    movdqu     xmm2, [esi]
+    lea        esi, [esi + 16]
     paddd      xmm2, xmm0
     movdqu     [edx], xmm2
     lea        edx, [edx + 16]
@@ -3891,52 +5503,16 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
-#ifdef HAS_ARGBSHADE_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    movd       xmm2, [esp + 16]  // value
-    sub        edx, eax
-    punpcklbw  xmm2, xmm2
-    punpcklqdq xmm2, xmm2
-
-    align      16
- convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
-    psrlw      xmm0, 8
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBSHADE_SSE2
-
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 12]   // src_argb
+    mov        eax, [esp + 12]  // src_argb
     mov        esi, [esp + 16]  // stride
     mov        edx, [esp + 20]  // dst_argb
     mov        ecx, [esp + 24]  // pointer to uv_dudv
@@ -3962,7 +5538,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     addps      xmm4, xmm4    // dudv *= 4
 
     // 4 pixel loop
-    align      4
   l4:
     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
@@ -3984,9 +5559,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     movd       xmm0, [eax + edi]  // read pixel 3
     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
     addps      xmm3, xmm4    // x, y += dx, dy next 2
-    sub        ecx, 4
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jge        l4
 
   l4b:
@@ -3994,7 +5569,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
     cvttps2dq  xmm0, xmm2    // x, y float to int
     packssdw   xmm0, xmm0    // x, y as shorts
@@ -4002,9 +5576,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     addps      xmm2, xmm7    // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        l1
   l1b:
     pop        edi
@@ -4014,11 +5588,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
-__declspec(naked) __declspec(align(16))
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) {
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+__declspec(naked)
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
   __asm {
     push       esi
     push       edi
@@ -4027,71 +5602,668 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
-    shr        eax, 1
+    cmp        eax, 128
+    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+
+    vmovd      xmm0, eax  // high fraction 0..255
+    neg        eax
+    add        eax, 256
+    vmovd      xmm5, eax  // low fraction 256..1
+    vpunpcklbw xmm5, xmm5, xmm0
+    vpunpcklwd xmm5, xmm5, xmm5
+    vbroadcastss ymm5, xmm5
+
+    mov        eax, 0x80808080  // 128b for bias and rounding.
+    vmovd      xmm4, eax
+    vbroadcastss ymm4, xmm4
+
+  xloop:
+    vmovdqu    ymm0, [esi]
+    vmovdqu    ymm2, [esi + edx]
+    vpunpckhbw ymm1, ymm0, ymm2  // mutates
+    vpunpcklbw ymm0, ymm0, ymm2
+    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
+    vpsubb     ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm5, ymm1
+    vpmaddubsw ymm0, ymm5, ymm0
+    vpaddw     ymm1, ymm1, ymm4  // unbias and round
+    vpaddw     ymm0, ymm0, ymm4
+    vpsrlw     ymm1, ymm1, 8
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vmovdqu    [esi + edi], ymm0
+    lea        esi, [esi + 32]
+    sub        ecx, 32
+    jg         xloop
+    jmp        xloop99
+
+   // Blend 50 / 50.
+ xloop50:
+   vmovdqu    ymm0, [esi]
+   vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop50
+   jmp        xloop99
+
+   // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+   rep movsb
+
+  xloop99:
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked)
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop1
-    cmp        eax, 64
-    je         xloop2
-    movd       xmm0, eax  // high fraction 0..127
+    je         xloop100  // 0 /256.  Blend 100 / 0.
+    cmp        eax, 128
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+
+    movd       xmm0, eax  // high fraction 0..255
     neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    add        eax, 256
+    movd       xmm5, eax  // low fraction 255..1
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm4, eax
+    pshufd     xmm4, xmm4, 0x00
 
-    align      16
   xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    movdqa     xmm1, xmm0
+    movdqu     xmm0, [esi]
+    movdqu     xmm2, [esi + edx]
+    movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
+    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm1, xmm4
+    movdqa     xmm2, xmm5
+    movdqa     xmm3, xmm5
+    pmaddubsw  xmm2, xmm0
+    pmaddubsw  xmm3, xmm1
+    paddw      xmm2, xmm4
+    paddw      xmm3, xmm4
+    psrlw      xmm2, 8
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqu     [esi + edi], xmm2
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop
+    jmp        xloop99
+
+    // Blend 50 / 50.
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+  xloop100:
+    movdqu     xmm0, [esi]
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop100
 
+  xloop99:
     pop        edi
     pop        esi
     ret
+  }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked)
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqu     xmm5, [ecx]
+    mov        ecx, [esp + 16]   // width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         wloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked)
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  __asm {
+    mov        eax, [esp + 4]     // src_argb
+    mov        edx, [esp + 8]     // dst_argb
+    mov        ecx, [esp + 12]    // shuffler
+    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
+    mov        ecx, [esp + 16]    // width
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5
+    vpshufb    ymm1, ymm1, ymm5
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked)
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  __asm {
+    push       ebx
+    push       esi
+    mov        eax, [esp + 8 + 4]    // src_argb
+    mov        edx, [esp + 8 + 8]    // dst_argb
+    mov        esi, [esp + 8 + 12]   // shuffler
+    mov        ecx, [esp + 8 + 16]   // width
+    pxor       xmm5, xmm5
+
+    mov        ebx, [esi]   // shuffler
+    cmp        ebx, 0x03000102
+    je         shuf_3012
+    cmp        ebx, 0x00010203
+    je         shuf_0123
+    cmp        ebx, 0x00030201
+    je         shuf_0321
+    cmp        ebx, 0x02010003
+    je         shuf_2103
+
+  // TODO(fbarchard): Use one source pointer and 3 offsets.
+  shuf_any1:
+    movzx      ebx, byte ptr [esi]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx], bl
+    movzx      ebx, byte ptr [esi + 1]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 1], bl
+    movzx      ebx, byte ptr [esi + 2]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 2], bl
+    movzx      ebx, byte ptr [esi + 3]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 3], bl
+    lea        eax, [eax + 4]
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jg         shuf_any1
+    jmp        shuf99
+
+  shuf_0123:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshuflw    xmm0, xmm0, 01Bh
+    pshufhw    xmm1, xmm1, 01Bh
+    pshuflw    xmm1, xmm1, 01Bh
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_0123
+    jmp        shuf99
 
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
+  shuf_0321:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshuflw    xmm0, xmm0, 039h
+    pshufhw    xmm1, xmm1, 039h
+    pshuflw    xmm1, xmm1, 039h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
+    jg         shuf_0321
+    jmp        shuf99
+
+  shuf_2103:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshuflw    xmm0, xmm0, 093h
+    pshufhw    xmm1, xmm1, 093h
+    pshuflw    xmm1, xmm1, 093h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_2103
+    jmp        shuf99
+
+  shuf_3012:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshuflw    xmm0, xmm0, 0C6h
+    pshufhw    xmm1, xmm1, 0C6h
+    pshuflw    xmm1, xmm1, 0C6h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_3012
+
+  shuf99:
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked)
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi
+
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2 // YUYV
+    punpckhbw  xmm1, xmm2
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
 
     pop        edi
     pop        esi
     ret
+  }
+}
 
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
+__declspec(naked)
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi
+
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    movdqa     xmm1, xmm2
+    lea        eax, [eax + 16]
+    punpcklbw  xmm1, xmm0 // UYVY
+    punpckhbw  xmm2, xmm0
+    movdqu     [edi], xmm1
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked)
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* src_argb */
+    mov        edx, [esp + 4 + 8]   /* dst_argb */
+    mov        esi, [esp + 4 + 12]  /* poly */
+    mov        ecx, [esp + 4 + 16]  /* width */
+    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
+
+    // 2 pixel loop.
+ convertloop:
+//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+    movq       xmm0, qword ptr [eax]  // BGRABGRA
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm3
+    movdqa     xmm4, xmm0
+    punpcklwd  xmm0, xmm3  // pixel 0
+    punpckhwd  xmm4, xmm3  // pixel 1
+    cvtdq2ps   xmm0, xmm0  // 4 floats
+    cvtdq2ps   xmm4, xmm4
+    movdqa     xmm1, xmm0  // X
+    movdqa     xmm5, xmm4
+    mulps      xmm0, [esi + 16]  // C1 * X
+    mulps      xmm4, [esi + 16]
+    addps      xmm0, [esi]  // result = C0 + C1 * X
+    addps      xmm4, [esi]
+    movdqa     xmm2, xmm1
+    movdqa     xmm6, xmm5
+    mulps      xmm2, xmm1  // X * X
+    mulps      xmm6, xmm5
+    mulps      xmm1, xmm2  // X * X * X
+    mulps      xmm5, xmm6
+    mulps      xmm2, [esi + 32]  // C2 * X * X
+    mulps      xmm6, [esi + 32]
+    mulps      xmm1, [esi + 48]  // C3 * X * X * X
+    mulps      xmm5, [esi + 48]
+    addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm4, xmm6
+    addps      xmm0, xmm1  // result += C3 * X * X * X
+    addps      xmm4, xmm5
+    cvttps2dq  xmm0, xmm0
+    cvttps2dq  xmm4, xmm4
+    packuswb   xmm0, xmm4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 2
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]   /* poly */
+    vbroadcastf128 ymm4, [ecx]       // C0
+    vbroadcastf128 ymm5, [ecx + 16]  // C1
+    vbroadcastf128 ymm6, [ecx + 32]  // C2
+    vbroadcastf128 ymm7, [ecx + 48]  // C3
+    mov        ecx, [esp + 16]  /* width */
+
+    // 2 pixel loop.
+ convertloop:
+    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
+    lea         eax, [eax + 8]
+    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vmulps      ymm2, ymm0, ymm0  // X * X
+    vmulps      ymm3, ymm0, ymm7  // C3 * X
+    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
+    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
+    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
+    vcvttps2dq  ymm0, ymm0
+    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
+    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
+    vmovq       qword ptr [edx], xmm0
+    lea         edx, [edx + 8]
+    sub         ecx, 2
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    movzx      edx, byte ptr [eax - 4 + 3]
+    movzx      edx, byte ptr [esi + edx * 4 + 3]
+    mov        byte ptr [eax - 4 + 3], dl
+    dec        ecx
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    dec        ecx
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   /* src_argb */
+    mov        edi, [esp + 8 + 8]   /* dst_argb */
+    mov        ecx, [esp + 8 + 12]  /* width */
+    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
+    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
+    pshufd     xmm2, xmm2, 0
+    pshufd     xmm3, xmm3, 0
+    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    psllw      xmm4, 8
+    pxor       xmm5, xmm5
+
+    // 4 pixel loop.
+  convertloop:
+    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
+    pmaddubsw  xmm0, xmm3
+    phaddw     xmm0, xmm0
+    pand       xmm0, xmm4  // mask out low bits
+    punpcklwd  xmm0, xmm5
+    paddd      xmm0, xmm2  // add table base
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi], dl
+    movzx      edx, byte ptr [eax + 1]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 1], dl
+    movzx      edx, byte ptr [eax + 2]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 2], dl
+    movzx      edx, byte ptr [eax + 3]  // copy alpha.
+    mov        byte ptr [edi + 3], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 4]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 4], dl
+    movzx      edx, byte ptr [eax + 5]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 5], dl
+    movzx      edx, byte ptr [eax + 6]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 6], dl
+    movzx      edx, byte ptr [eax + 7]  // copy alpha.
+    mov        byte ptr [edi + 7], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 8]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 8], dl
+    movzx      edx, byte ptr [eax + 9]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 9], dl
+    movzx      edx, byte ptr [eax + 10]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 10], dl
+    movzx      edx, byte ptr [eax + 11]  // copy alpha.
+    mov        byte ptr [edi + 11], dl
+
+    movd       esi, xmm0
+
+    movzx      edx, byte ptr [eax + 12]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 12], dl
+    movzx      edx, byte ptr [eax + 13]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 13], dl
+    movzx      edx, byte ptr [eax + 14]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 14], dl
+    movzx      edx, byte ptr [eax + 15]  // copy alpha.
+    mov        byte ptr [edi + 15], dl
+
+    lea        eax, [eax + 16]
+    lea        edi, [edi + 16]
     sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
+    jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
-#endif  // _M_IX86
+#endif  // defined(_M_X64)
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/files/source/scale.cc b/files/source/scale.cc
index 38910c91..36e3fe52 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -12,2444 +12,420 @@
 
 #include <assert.h>
 #include <string.h>
-#include <stdlib.h>  // For getenv()
 
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"  // For CopyPlane
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
-
-// Note: Some SSE2 reference manuals
-// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-static bool use_reference_impl_ = false;
-
-LIBYUV_API
-void SetUseReferenceImpl(bool use) {
-  use_reference_impl_ = use;
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
 }
 
-// ScaleRowDown2Int also used by planar functions
-
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
-
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_SCALEROWDOWN2_NEON
-// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                        uint8* dst, int dst_width);
-
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-
-#define HAS_SCALEROWDOWN4_NEON
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-//  to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
-                         ptrdiff_t /* src_stride */,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN38_NEON
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
-                         ptrdiff_t /* src_stride */,
-                         uint8* dst_ptr, int dst_width);
-// 32x3 -> 12x1
-void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-// 16x2 -> 16x1
-#define HAS_SCALEFILTERROWS_NEON
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction);
-
-/**
- * SSE2 downscalers with interpolation.
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-
-
-// Constants for SSSE3 code
-#elif !defined(YUV_DISABLE_ASM) && \
-    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
-
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
 
-// Offsets for source bytes 0 to 9
-CONST uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-CONST uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-CONST uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-CONST uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-CONST uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-CONST uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-CONST uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-CONST vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-CONST uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-CONST uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-CONST uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-CONST uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-CONST uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-CONST uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-#endif
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
 
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-
-#define HAS_SCALEROWDOWN2_SSE2
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
-}
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        esi
-    ret
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering == kFilterNone ? ScaleRowDown2_C :
+      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  int row_stride = src_stride << 1;
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
   }
-}
 
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                         ptrdiff_t src_stride,
-                                         uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      16
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+        ScaleRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+          ScaleRowDown2Box_NEON);
+    }
   }
-}
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
-                                            ptrdiff_t src_stride,
-                                            uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      16
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        esi
-    ret
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+        ScaleRowDown2Box_Any_SSSE3);
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+          ScaleRowDown2Box_SSSE3);
+    }
   }
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
-    psrld      xmm5, 24
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    packuswb   xmm0, xmm0
-    sub        ecx, 8
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    jg         wloop
-
-    ret
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+        ScaleRowDown2Box_Any_AVX2);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+          ScaleRowDown2Box_AVX2);
+    }
   }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
-    lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
-    psrlw      xmm7, 8
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, [eax + esi * 2]
-    movdqa     xmm3, [eax + esi * 2 + 16]
-    movdqa     xmm4, [eax + edi]
-    movdqa     xmm5, [eax + edi + 16]
-    lea        eax, [eax + 32]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm7
-    pand       xmm3, xmm7
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
-    psrlw      xmm0, 8
-    pand       xmm2, xmm7
-    pavgw      xmm0, xmm2
-    packuswb   xmm0, xmm0
-
-    sub        ecx, 8
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    jg         wloop
-
-    pop        edi
-    pop        esi
-    ret
+#endif
+#if defined(HAS_SCALEROWDOWN2_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
   }
-}
+#endif
 
-#define HAS_SCALEROWDOWN8_SSE2
-// Point samples 32 pixels to 4 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
-    psrlq      xmm5, 56
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1  // 32->16
-    packuswb   xmm0, xmm0  // 16->8
-    packuswb   xmm0, xmm0  // 8->4
-    sub        ecx, 4
-    movd       dword ptr [edx], xmm0
-    lea        edx, [edx + 4]
-    jg         wloop
-
-    ret
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
   }
-}
-
-// Blends 32x8 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebp
-    mov        eax, [esp + 12 + 4]   // src_ptr
-    mov        esi, [esp + 12 + 8]   // src_stride
-    mov        edx, [esp + 12 + 12]  // dst_ptr
-    mov        ecx, [esp + 12 + 16]  // dst_width
-    lea        edi, [esi + esi * 2]  // src_stride * 3
-    pxor       xmm7, xmm7
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]           // average 8 rows to 1
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, [eax + esi * 2]
-    movdqa     xmm3, [eax + esi * 2 + 16]
-    movdqa     xmm4, [eax + edi]
-    movdqa     xmm5, [eax + edi + 16]
-    lea        ebp, [eax + esi * 4]
-    lea        eax, [eax + 32]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, [ebp]
-    movdqa     xmm3, [ebp + 16]
-    movdqa     xmm4, [ebp + esi]
-    movdqa     xmm5, [ebp + esi + 16]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    movdqa     xmm4, [ebp + esi * 2]
-    movdqa     xmm5, [ebp + esi * 2 + 16]
-    movdqa     xmm6, [ebp + edi]
-    pavgb      xmm4, xmm6
-    movdqa     xmm6, [ebp + edi + 16]
-    pavgb      xmm5, xmm6
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    psadbw     xmm0, xmm7            // average 32 pixels to 4
-    psadbw     xmm1, xmm7
-    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
-    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
-    por        xmm0, xmm1            //      -> 3201
-    psrlw      xmm0, 3
-    packuswb   xmm0, xmm0
-    packuswb   xmm0, xmm0
-
-    sub        ecx, 4
-    movd       dword ptr [edx], xmm0
-    lea        edx, [edx + 4]
-    jg         wloop
-
-    pop        ebp
-    pop        edi
-    pop        esi
-    ret
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
   }
 }
 
-#define HAS_SCALEROWDOWN34_SSSE3
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm3, kShuf0
-    movdqa     xmm4, kShuf1
-    movdqa     xmm5, kShuf2
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm1
-    palignr    xmm1, xmm0, 8
-    pshufb     xmm0, xmm3
-    pshufb     xmm1, xmm4
-    pshufb     xmm2, xmm5
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + 8], xmm1
-    movq       qword ptr [edx + 16], xmm2
-    lea        edx, [edx + 24]
-    sub        ecx, 24
-    jg         wloop
-
-    ret
+static void ScalePlaneDown2_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+    filtering == kFilterNone ? ScaleRowDown2_16_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+        ScaleRowDown2Box_16_C);
+  int row_stride = src_stride << 1;
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
   }
-}
 
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShuf01
-    movdqa     xmm3, kShuf11
-    movdqa     xmm4, kShuf21
-    movdqa     xmm5, kMadd01
-    movdqa     xmm6, kMadd11
-    movdqa     xmm7, kRound34
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
-    movdqu     xmm1, [eax + esi + 8]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
-    lea        eax, [eax + 32]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, kMadd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    sub        ecx, 24
-    movq       qword ptr [edx + 16], xmm0
-    lea        edx, [edx + 24]
-    jg         wloop
-
-    pop        esi
-    ret
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+        ScaleRowDown2_16_NEON;
   }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShuf01
-    movdqa     xmm3, kShuf11
-    movdqa     xmm4, kShuf21
-    movdqa     xmm5, kMadd01
-    movdqa     xmm6, kMadd11
-    movdqa     xmm7, kRound34
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
-    movdqu     xmm1, [eax + esi + 8]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
-    lea        eax, [eax + 32]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, kMadd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    sub        ecx, 24
-    movq       qword ptr [edx + 16], xmm0
-    lea        edx, [edx+24]
-    jg         wloop
-
-    pop        esi
-    ret
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+        ScaleRowDown2Box_16_SSE2);
   }
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm4, kShuf38a
-    movdqa     xmm5, kShuf38b
-
-    align      16
-  xloop:
-    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm4
-    pshufb     xmm1, xmm5
-    paddusb    xmm0, xmm1
-
-    sub        ecx, 12
-    movq       qword ptr [edx], xmm0 // write 12 pixels
-    movhlps    xmm1, xmm0
-    movd       [edx + 8], xmm1
-    lea        edx, [edx + 12]
-    jg         xloop
-
-    ret
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
   }
-}
+#endif
 
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShufAc
-    movdqa     xmm3, kShufAc3
-    movdqa     xmm4, kScaleAc33
-    pxor       xmm5, xmm5
-
-    align      16
-  xloop:
-    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
-    movdqa     xmm6, [eax + esi]
-    movhlps    xmm1, xmm0
-    movhlps    xmm7, xmm6
-    punpcklbw  xmm0, xmm5
-    punpcklbw  xmm1, xmm5
-    punpcklbw  xmm6, xmm5
-    punpcklbw  xmm7, xmm5
-    paddusw    xmm0, xmm6
-    paddusw    xmm1, xmm7
-    movdqa     xmm6, [eax + esi * 2]
-    lea        eax, [eax + 16]
-    movhlps    xmm7, xmm6
-    punpcklbw  xmm6, xmm5
-    punpcklbw  xmm7, xmm5
-    paddusw    xmm0, xmm6
-    paddusw    xmm1, xmm7
-
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
-    psrldq     xmm0, 2
-    paddusw    xmm6, xmm0
-    psrldq     xmm0, 2
-    paddusw    xmm6, xmm0
-    pshufb     xmm6, xmm2
-
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
-    psrldq     xmm1, 2
-    paddusw    xmm7, xmm1
-    psrldq     xmm1, 2
-    paddusw    xmm7, xmm1
-    pshufb     xmm7, xmm3
-    paddusw    xmm6, xmm7
-
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
-    packuswb   xmm6, xmm6
-
-    sub        ecx, 6
-    movd       [edx], xmm6           // write 6 pixels
-    psrlq      xmm6, 16
-    movd       [edx + 2], xmm6
-    lea        edx, [edx + 6]
-    jg         xloop
-
-    pop        esi
-    ret
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
   }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShufAb0
-    movdqa     xmm3, kShufAb1
-    movdqa     xmm4, kShufAb2
-    movdqa     xmm5, kScaleAb2
-
-    align      16
-  xloop:
-    movdqa     xmm0, [eax]           // average 2 rows into xmm0
-    pavgb      xmm0, [eax + esi]
-    lea        eax, [eax + 16]
-
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
-    pshufb     xmm1, xmm2
-    movdqa     xmm6, xmm0
-    pshufb     xmm6, xmm3
-    paddusw    xmm1, xmm6
-    pshufb     xmm0, xmm4
-    paddusw    xmm1, xmm0
-
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
-    packuswb   xmm1, xmm1
-
-    sub        ecx, 6
-    movd       [edx], xmm1           // write 6 pixels
-    psrlq      xmm1, 16
-    movd       [edx + 2], xmm1
-    lea        edx, [edx + 6]
-    jg         xloop
-
-    pop        esi
-    ret
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
   }
 }
 
-#define HAS_SCALEADDROWS_SSE2
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-__declspec(naked) __declspec(align(16))
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst_ptr, int src_width,
-                              int src_height) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    pxor       xmm4, xmm4
-    dec        ebx
-
-    align      16
-  xloop:
-    // first row
-    movdqa     xmm0, [esi]
-    lea        eax, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    lea        esi, [esi + 16]
-    mov        ebp, ebx
-    test       ebp, ebp
-    je         ydone
-
-    // sum remaining rows
-    align      16
-  yloop:
-    movdqa     xmm2, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    paddusw    xmm0, xmm2        // sum 16 words
-    paddusw    xmm1, xmm3
-    sub        ebp, 1
-    jg         yloop
-  ydone:
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm1
-    lea        edi, [edi + 32]
-
-    sub        ecx, 16
-    jg         xloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
 
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-// Normal formula for bilinear interpolation is:
-//   source_y_fraction * row1 + (1 - source_y_fraction) row0
-// SSE2 version using the a single multiply of difference:
-//   source_y_fraction * (row1 - row0) + row0
-#define HAS_SCALEFILTERROWS_SSE2_DISABLED
-__declspec(naked) __declspec(align(16))
-static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                 ptrdiff_t src_stride, int dst_width,
-                                 int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 128
-    je         xloop2
-
-    movd       xmm5, eax            // xmm5 = y fraction
-    punpcklbw  xmm5, xmm5
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-    pxor       xmm4, xmm4
-
-    align      16
-  xloop:
-    movdqa     xmm0, [esi]  // row0
-    movdqa     xmm2, [esi + edx]  // row1
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    psubw      xmm2, xmm0  // row1 - row0
-    psubw      xmm3, xmm1
-    pmulhw     xmm2, xmm5  // scale diff
-    pmulhw     xmm3, xmm5
-    paddw      xmm0, xmm2  // sum rows
-    paddw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+  int row_stride = src_stride << 2;
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
   }
-}
-#endif  // SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
-#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked) __declspec(align(16))
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                  ptrdiff_t src_stride, int dst_width,
-                                  int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    shr        eax, 1
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 64
-    je         xloop2
-    movd       xmm0, eax  // high fraction 0..127
-    neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
-    punpcklbw  xmm5, xmm0
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-
-    align      16
-  xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+    }
   }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%3,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%3,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                         ptrdiff_t src_stride,
-                                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
-                                            ptrdiff_t src_stride,
-                                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqu    (%0),%%xmm0                     \n"
-    "movdqu    0x10(%0),%%xmm1                 \n"
-    "movdqu    (%0,%3,1),%%xmm2                \n"
-    "movdqu    0x10(%0,%3,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  intptr_t stridex3 = 0;
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0x8,%%xmm7                     \n"
-    "lea       (%4,%4,2),%3                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%4,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    (%0,%4,2),%%xmm2                \n"
-    "movdqa    0x10(%0,%4,2),%%xmm3            \n"
-    "movdqa    (%0,%3,1),%%xmm4                \n"
-    "movdqa    0x10(%0,%3,1),%%xmm5            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm4,%%xmm2                   \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm5,%%xmm3                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "pand      %%xmm7,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "lea       0x8(%1),%1                      \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(stridex3)     // %3
-  : "r"(static_cast<intptr_t>(src_stride))    // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
-  );
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlq     $0x38,%%xmm5                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,(%1)                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  intptr_t stridex3 = 0;
-  intptr_t row4 = 0;
-  asm volatile (
-    "lea       (%5,%5,2),%3                    \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%5,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%5,1),%%xmm3            \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    (%0,%5,2),%%xmm2                \n"
-    "movdqa    0x10(%0,%5,2),%%xmm3            \n"
-    "movdqa    (%0,%3,1),%%xmm4                \n"
-    "movdqa    0x10(%0,%3,1),%%xmm5            \n"
-    "lea       (%0,%5,4),%4                    \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm4,%%xmm2                   \n"
-    "pavgb     %%xmm5,%%xmm3                   \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    0x0(%4),%%xmm2                  \n"
-    "movdqa    0x10(%4),%%xmm3                 \n"
-    "movdqa    0x0(%4,%5,1),%%xmm4             \n"
-    "movdqa    0x10(%4,%5,1),%%xmm5            \n"
-    "pavgb     %%xmm4,%%xmm2                   \n"
-    "pavgb     %%xmm5,%%xmm3                   \n"
-    "movdqa    0x0(%4,%5,2),%%xmm4             \n"
-    "movdqa    0x10(%4,%5,2),%%xmm5            \n"
-    "movdqa    0x0(%4,%3,1),%%xmm6             \n"
-    "pavgb     %%xmm6,%%xmm4                   \n"
-    "movdqa    0x10(%4,%3,1),%%xmm6            \n"
-    "pavgb     %%xmm6,%%xmm5                   \n"
-    "pavgb     %%xmm4,%%xmm2                   \n"
-    "pavgb     %%xmm5,%%xmm3                   \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psadbw    %%xmm7,%%xmm0                   \n"
-    "psadbw    %%xmm7,%%xmm1                   \n"
-    "pshufd    $0xd8,%%xmm0,%%xmm0             \n"
-    "pshufd    $0x8d,%%xmm1,%%xmm1             \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x3,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,(%1)                     \n"
-    "lea       0x4(%1),%1                      \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+rm"(dst_width),  // %2
-    "+r"(stridex3),    // %3
-    "+r"(row4)         // %4
-  : "r"(static_cast<intptr_t>(src_stride))  // %5
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm2                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "palignr   $0x8,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movq      %%xmm1,0x8(%1)                  \n"
-    "movq      %%xmm2,0x10(%1)                 \n"
-    "lea       0x18(%1),%1                     \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm6                     \n"
-    "movdqa    (%0,%3),%%xmm7                  \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,(%1)                     \n"
-    "movdqu    0x8(%0),%%xmm6                  \n"
-    "movdqu    0x8(%0,%3),%%xmm7               \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,0x8(%1)                  \n"
-    "movdqa    0x10(%0),%%xmm6                 \n"
-    "movdqa    0x10(%0,%3),%%xmm7              \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,0x10(%1)                 \n"
-    "lea       0x18(%1),%1                     \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "r"(static_cast<intptr_t>(src_stride)),  // %3
-    "m"(kMadd21)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm6                     \n"
-    "movdqa    (%0,%3,1),%%xmm7                \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,(%1)                     \n"
-    "movdqu    0x8(%0),%%xmm6                  \n"
-    "movdqu    0x8(%0,%3,1),%%xmm7             \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,0x8(%1)                  \n"
-    "movdqa    0x10(%0),%%xmm6                 \n"
-    "movdqa    0x10(%0,%3,1),%%xmm7            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6,0x10(%1)                 \n"
-    "lea       0x18(%1),%1                     \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-    : "+r"(src_ptr),   // %0
-      "+r"(dst_ptr),   // %1
-      "+r"(dst_width)  // %2
-    : "r"(static_cast<intptr_t>(src_stride)),  // %3
-      "m"(kMadd21)     // %4
-    : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0,(%1)                     \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movd      %%xmm1,0x8(%1)                  \n"
-    "lea       0xc(%1),%1                      \n"
-    "sub       $0xc,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "m"(kShuf38a),   // %3
-    "m"(kShuf38b)    // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-      , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
-  );
-}
-
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pavgb     (%0,%3,1),%%xmm0                \n"
-    "lea       0x10(%0),%0                     \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pshufb    %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "paddusw   %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "paddusw   %%xmm0,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "sub       $0x6,%2                         \n"
-    "movd      %%xmm1,(%1)                     \n"
-    "psrlq     $0x10,%%xmm1                    \n"
-    "movd      %%xmm1,0x2(%1)                  \n"
-    "lea       0x6(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"(static_cast<intptr_t>(src_stride))  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
-  );
-}
-
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
-                                       ptrdiff_t src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    (%0,%3,1),%%xmm6                \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    (%0,%3,2),%%xmm6                \n"
-    "lea       0x10(%0),%0                     \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "pshufb    %%xmm3,%%xmm7                   \n"
-    "paddusw   %%xmm7,%%xmm6                   \n"
-    "pmulhuw   %%xmm4,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "sub       $0x6,%2                         \n"
-    "movd      %%xmm6,(%1)                     \n"
-    "psrlq     $0x10,%%xmm6                    \n"
-    "movd      %%xmm6,0x2(%1)                  \n"
-    "lea       0x6(%1),%1                      \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst_ptr, int src_width, int src_height) {
-  int tmp_height = 0;
-  intptr_t tmp_src = 0;
-  asm volatile (
-    "pxor      %%xmm4,%%xmm4                   \n"
-    "sub       $0x1,%5                         \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "mov       %0,%3                           \n"
-    "add       %6,%0                           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "mov       %5,%2                           \n"
-    "test      %2,%2                           \n"
-    "je        3f                              \n"
-  "2:                                          \n"
-    "movdqa    (%0),%%xmm2                     \n"
-    "add       %6,%0                           \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "paddusw   %%xmm2,%%xmm0                   \n"
-    "paddusw   %%xmm3,%%xmm1                   \n"
-    "sub       $0x1,%2                         \n"
-    "jg        2b                              \n"
-  "3:                                          \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "movdqa    %%xmm1,0x10(%1)                 \n"
-    "lea       0x10(%3),%0                     \n"
-    "lea       0x20(%1),%1                     \n"
-    "sub       $0x10,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(tmp_height),  // %2
-    "+r"(tmp_src),     // %3
-    "+r"(src_width),   // %4
-    "+rm"(src_height)  // %5
-  : "rm"(static_cast<intptr_t>(src_stride))  // %6
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
 #endif
-  );
-}
-
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2_DISABLED
-static void ScaleFilterRows_SSE2(uint8* dst_ptr,
-                                 const uint8* src_ptr, ptrdiff_t src_stride,
-                                 int dst_width, int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x80,%3                        \n"
-    "je        3f                              \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "movdqa    (%1,%4,1),%%xmm2                \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm2                   \n"
-    "psubw     %%xmm1,%%xmm3                   \n"
-    "pmulhw    %%xmm5,%%xmm2                   \n"
-    "pmulhw    %%xmm5,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "2:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
-    "punpckhqdq %%xmm0,%%xmm0                  \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+    }
+  }
 #endif
-  );
-}
-#endif  // SSE2_DISABLED
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
-                                  const uint8* src_ptr, ptrdiff_t src_stride,
-                                  int dst_width, int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "movdqa    (%1,%4,1),%%xmm2                \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "pmaddubsw %%xmm5,%%xmm0                   \n"
-    "pmaddubsw %%xmm5,%%xmm1                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "2:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
-    "punpckhqdq %%xmm0,%%xmm0                  \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+    }
+  }
 #endif
-  );
-}
-#endif  // defined(__x86_64__) || defined(__i386__)
-
-// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                            uint8* dst, int dst_width) {
-  uint8* dend = dst + dst_width - 1;
-  do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[2];
-    dst += 2;
-    src_ptr += 4;
-  } while (dst < dend);
-  if (dst_width & 1) {
-    dst[0] = src_ptr[0];
+#if defined(HAS_SCALEROWDOWN4_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
   }
-}
+#endif
 
-void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  uint8* dend = dst + dst_width - 1;
-  do {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
-    dst += 2;
-    s += 4;
-    t += 4;
-  } while (dst < dend);
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
   }
-}
-
-static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                            uint8* dst, int dst_width) {
-  uint8* dend = dst + dst_width - 1;
-  do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[4];
-    dst += 2;
-    src_ptr += 8;
-  } while (dst < dend);
-  if (dst_width & 1) {
-    dst[0] = src_ptr[0];
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
   }
 }
 
-static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  uint8* dend = dst + dst_width - 1;
-  do {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
-    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
-    dst += 2;
-    src_ptr += 8;
-  } while (dst < dend);
-  if (dst_width & 1) {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+static void ScalePlaneDown4_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+  int row_stride = src_stride << 2;
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
   }
-}
-
-// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
-// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
-static const int kMaxOutputWidth = 640;
-static const int kMaxRow12 = kMaxOutputWidth * 2;
-
-static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                            uint8* dst, int dst_width) {
-  uint8* dend = dst + dst_width - 1;
-  do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[8];
-    dst += 2;
-    src_ptr += 16;
-  } while (dst < dend);
-  if (dst_width & 1) {
-    dst[0] = src_ptr[0];
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+        ScaleRowDown4_16_NEON;
   }
-}
-
-// Note calling code checks width is less than max and if not
-// uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
-  assert(dst_width <= kMaxOutputWidth);
-  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
-  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
-                     src_row + kMaxOutputWidth,
-                     dst_width * 2);
-  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
-}
-
-static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                             uint8* dst, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = dst + dst_width;
-  do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[1];
-    dst[2] = src_ptr[3];
-    dst += 3;
-    src_ptr += 4;
-  } while (dst < dend);
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* d, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  uint8* dend = d + dst_width;
-  do {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 * 3 + b0 + 2) >> 2;
-    d[1] = (a1 * 3 + b1 + 2) >> 2;
-    d[2] = (a2 * 3 + b2 + 2) >> 2;
-    d += 3;
-    s += 4;
-    t += 4;
-  } while (d < dend);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* d, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  uint8* dend = d + dst_width;
-  do {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 + b0 + 1) >> 1;
-    d[1] = (a1 + b1 + 1) >> 1;
-    d[2] = (a2 + b2 + 1) >> 1;
-    d += 3;
-    s += 4;
-    t += 4;
-  } while (d < dend);
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (static_cast<int>(a) + \
-    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int x, int dx) {
-  for (int j = 0; j < dst_width - 1; j += 2) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src_ptr[xi];
-    b = src_ptr[xi + 1];
-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+        ScaleRowDown4_16_SSE2;
   }
-}
-
-static const int kMaxInputWidth = 2560;
-
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-// Filter row to 3/4
-static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
-                                int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  const uint8* s = src_ptr;
-  uint8* dend = dst_ptr + dst_width;
-  do {
-    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    dst_ptr += 3;
-    s += 4;
-  } while (dst_ptr < dend);
-}
-
-#define HAS_SCALEROWDOWN34_SSE2_DISABLED
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
-  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
-  ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
-  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
-  ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
 #endif
-
-static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                             uint8* dst, int dst_width) {
-  assert(dst_width % 3 == 0);
-  for (int x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[3];
-    dst[2] = src_ptr[6];
-    dst += 3;
-    src_ptr += 8;
+#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
   }
-}
+#endif
 
-// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
-                                   ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  intptr_t stride = src_stride;
-  for (int i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
   }
-}
-
-// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width) {
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  intptr_t stride = src_stride;
-  for (int i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
   }
 }
 
-// C version 8x2 -> 8x1
-static void ScaleFilterRows_C(uint8* dst_ptr,
-                              const uint8* src_ptr, ptrdiff_t src_stride,
-                              int dst_width, int source_y_fraction) {
-  assert(dst_width > 0);
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  uint8* end = dst_ptr + dst_width;
-  do {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
-    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
-    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
-    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
-    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
-    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
-    src_ptr += 8;
-    src_ptr1 += 8;
-    dst_ptr += 8;
-  } while (dst_ptr < end);
-  dst_ptr[0] = dst_ptr[-1];
-}
+// Scale plane down, 3/4
 
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  assert(src_width > 0);
-  assert(src_height > 0);
-  for (int x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    int sum = 0;
-    for (int y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    dst_ptr[x] = sum;
-  }
-}
-
-/**
- * Scale plane, 1/2
- *
- * This is an optimized version for scaling down a plane to 1/2 of
- * its original size.
- *
- */
-static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterMode filtering) {
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
-#if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_C;
+    ScaleRowDown34_1 = ScaleRowDown34_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
   }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
-        ScaleRowDown2_Unaligned_SSE2;
-    if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-      ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+      }
     }
   }
 #endif
-
-  // TODO(fbarchard): Loop through source height to allow odd height.
-  for (int y = 0; y < dst_height; ++y) {
-    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += (src_stride << 1);
-    dst_ptr += dst_stride;
-  }
-}
-
-/**
- * Scale plane, 1/4
- *
- * This is an optimized version for scaling down a plane to 1/4 of
- * its original size.
- */
-static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterMode filtering) {
-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
-#if defined(HAS_SCALEROWDOWN4_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_width, 4)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+      }
+    }
   }
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
+#endif
+#if defined(HAS_SCALEROWDOWN34_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
+    }
   }
 #endif
 
-  for (int y = 0; y < dst_height; ++y) {
-    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += (src_stride << 2);
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+                     dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
-}
-
-/**
- * Scale plane, 1/8
- *
- * This is an optimized version for scaling down a plane to 1/8
- * of its original size.
- *
- */
-static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterMode filtering) {
-  void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering && (dst_width <= kMaxOutputWidth) ?
-      ScaleRowDown8Int_C : ScaleRowDown8_C;
-#if defined(HAS_SCALEROWDOWN8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
-  }
-#endif
 
-  for (int y = 0; y < dst_height; ++y) {
-    ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += (src_stride << 3);
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
     dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
   }
 }
 
-/**
- * Scale plane down, 3/4
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             FilterMode filtering) {
+static void ScalePlaneDown34_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
   if (!filtering) {
-    ScaleRowDown34_0 = ScaleRowDown34_C;
-    ScaleRowDown34_1 = ScaleRowDown34_C;
+    ScaleRowDown34_0 = ScaleRowDown34_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_16_C;
   } else {
-    ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
-    ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
   }
-#if defined(HAS_SCALEROWDOWN34_NEON)
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
     if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
     } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
-    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
-    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+    }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
     if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
     } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
     }
   }
 #endif
 
-  for (int y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
                      dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
@@ -2457,7 +433,7 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
 
   // Remainder 1 or 2 rows with last row vertically unfiltered
   if ((dst_height % 3) == 2) {
-    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
     ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
@@ -2466,78 +442,110 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
   }
 }
 
-/**
- * Scale plane, 3/8
- *
- * This is an optimized version for scaling down a plane to 3/8
- * of its original size.
- *
- * Uses box filter arranges like this
- * aaabbbcc -> abc
- * aaabbbcc    def
- * aaabbbcc    ghi
- * dddeeeff
- * dddeeeff
- * dddeeeff
- * ggghhhii
- * ggghhhii
- * Boxes are 3x3, 2x3, 3x2 and 2x2
- */
-static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filter arranges like this
+// aaabbbcc -> abc
+// aaabbbcc    def
+// aaabbbcc    ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
-                             FilterMode filtering) {
-  assert(dst_width % 3 == 0);
+                             enum FilterMode filtering) {
+  int y;
   void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
   void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_C;
     ScaleRowDown38_2 = ScaleRowDown38_C;
   } else {
-    ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
-    ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
   }
+
 #if defined(HAS_SCALEROWDOWN38_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_NEON;
+      ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_NEON;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+      }
     }
   }
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+    }
+    if (dst_width % 12 == 0 && !filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+    }
+    if (dst_width % 6 == 0 && filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
     }
   }
 #endif
 
-  for (int y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
-    ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
 
   // Remainder 1 or 2 rows with last row vertically unfiltered
   if ((dst_height % 3) == 2) {
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
@@ -2546,36 +554,101 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
   }
 }
 
-static __inline uint32 SumBox(int iboxwidth, int iboxheight,
-                              ptrdiff_t src_stride, const uint8* src_ptr) {
-  assert(iboxwidth > 0);
-  assert(iboxheight > 0);
-  uint32 sum = 0u;
-  for (int y = 0; y < iboxheight; ++y) {
-    for (int x = 0; x < iboxwidth; ++x) {
-      sum += src_ptr[x];
+static void ScalePlaneDown38_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown38_3 = ScaleRowDown38_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_16_C;
+  } else {
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+  }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
     }
-    src_ptr += src_stride;
   }
-  return sum;
-}
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
+    }
+  }
+#endif
 
-static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
-                               int x, int dx, ptrdiff_t src_stride,
-                               const uint8* src_ptr, uint8* dst_ptr) {
-  for (int i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    x += dx;
-    int boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
-        (boxwidth * boxheight);
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
   }
 }
 
+#define MIN1(x) ((x) < 1 ? 1 : (x))
+
 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  uint32 sum = 0u;
+  int x;
   assert(iboxwidth > 0);
+  for (x = 0; x < iboxwidth; ++x) {
+    sum += src_ptr[x];
+  }
+  return sum;
+}
+
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
   uint32 sum = 0u;
-  for (int x = 0; x < iboxwidth; ++x) {
+  int x;
+  assert(iboxwidth > 0);
+  for (x = 0; x < iboxwidth; ++x) {
     sum += src_ptr[x];
   }
   return sum;
@@ -2583,339 +656,895 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
 
 static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
                             const uint16* src_ptr, uint8* dst_ptr) {
+  int i;
+  int scaletbl[2];
+  int minboxwidth = dx >> 16;
+  int boxwidth;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    boxwidth = MIN1((x >> 16) - ix);
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
+        scaletbl[boxwidth - minboxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int i;
   int scaletbl[2];
-  int minboxwidth = (dx >> 16);
-  scaletbl[0] = 65536 / (minboxwidth * boxheight);
-  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
-  int *scaleptr = scaletbl - minboxwidth;
-  for (int i = 0; i < dst_width; ++i) {
+  int minboxwidth = dx >> 16;
+  int boxwidth;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
-    int boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+    boxwidth = MIN1((x >> 16) - ix);
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+        scaletbl[boxwidth - minboxwidth]  >> 16;
+  }
+}
+
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int scaleval = 65536 / boxheight;
+  int i;
+  src_ptr += (x >> 16);
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = src_ptr[i] * scaleval >> 16;
   }
 }
 
 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
                             const uint16* src_ptr, uint8* dst_ptr) {
-  int boxwidth = (dx >> 16);
+  int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
-  for (int i = 0; i < dst_width; ++i) {
+  int i;
+  x >>= 16;
+  for (i = 0; i < dst_width; ++i) {
     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
     x += boxwidth;
   }
 }
 
-/**
- * Scale plane down to any dimensions, with interpolation.
- * (boxfilter).
- *
- * Same method as SimpleScale, which is fixed point, outputting
- * one pixel of destination using fixed point (16.16) to step
- * through source, sampling a box of pixel with simple
- * averaging.
- */
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int boxwidth = MIN1(dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixel with simple
+// averaging.
 static void ScalePlaneBox(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr) {
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
-  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-  int maxy = (src_height << 16);
-  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
-      dst_height * 2 > src_height) {
-    uint8* dst = dst_ptr;
-    for (int j = 0; j < dst_height; ++j) {
+  int j, k;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  {
+    // Allocate a row buffer of uint16.
+    align_buffer_64(row16, src_width * 2);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C:
+        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+        ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      ScaleAddRow = ScaleAddRow_Any_SSE2;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_SSE2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      ScaleAddRow = ScaleAddRow_Any_AVX2;
+      if (IS_ALIGNED(src_width, 32)) {
+        ScaleAddRow = ScaleAddRow_AVX2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      ScaleAddRow = ScaleAddRow_Any_NEON;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_NEON;
+      }
+    }
+#endif
+
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
       int iy = y >> 16;
       const uint8* src = src_ptr + iy * src_stride;
       y += dy;
-      if (y > maxy) {
-        y = maxy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - iy);
+      memset(row16, 0, src_width * 2);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        src += src_stride;
       }
-      int boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow_C(dst_width, boxheight,
-                         x, dx, src_stride,
-                         src, dst);
-      dst += dst_stride;
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      dst_ptr += dst_stride;
     }
-  } else {
-    SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
-    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst_ptr, int src_width, int src_height)=
-        ScaleAddRows_C;
+    free_aligned_buffer_64(row16);
+  }
+}
+
+static void ScalePlaneBox_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr) {
+  int j, k;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  {
+    // Allocate a row buffer of uint32.
+    align_buffer_64(row32, src_width * 4);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-                         const uint16* src_ptr, uint8* dst_ptr);
-    if (dx & 0xffff) {
-      ScaleAddCols = ScaleAddCols2_C;
-    } else {
-      ScaleAddCols = ScaleAddCols1_C;
-    }
-#if defined(HAS_SCALEADDROWS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
-      ScaleAddRows = ScaleAddRows_SSE2;
+        const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+        ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+      ScaleAddRow = ScaleAddRow_16_SSE2;
     }
 #endif
 
-    for (int j = 0; j < dst_height; ++j) {
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
       int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
+      const uint16* src = src_ptr + iy * src_stride;
       y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - iy);
+      memset(row32, 0, src_width * 4);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        src += src_stride;
       }
-      int boxheight = (y >> 16) - iy;
-      ScaleAddRows(src, src_stride, row, src_width, boxheight);
-      ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
       dst_ptr += dst_stride;
     }
+    free_aligned_buffer_64(row32);
   }
 }
 
-/**
- * Scale plane to/from any dimensions, with interpolation.
- */
-static void ScalePlaneBilinearSimple(int src_width, int src_height,
-                                     int dst_width, int dst_height,
-                                     int src_stride, int dst_stride,
-                                     const uint8* src_ptr, uint8* dst_ptr) {
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-  int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
-  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
-  for (int i = 0; i < dst_height; ++i) {
-    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+// Scale plane down with bilinear interpolation.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer.
+  align_buffer_64(row, src_width);
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    int yf = y & 0xffff;
-    const uint8* src0 = src_ptr + yi * src_stride;
-    const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
-    uint8* dst = dst_ptr;
-    for (int j = 0; j < dst_width; ++j) {
-      int xi = x >> 16;
-      int xf = x & 0xffff;
-      int x1 = (xi < src_width - 1) ? xi + 1 : xi;
-      int a = src0[xi];
-      int b = src0[x1];
-      int r0 = BLENDER(a, b, xf);
-      a = src1[xi];
-      b = src1[x1];
-      int r1 = BLENDER(a, b, xf);
-      *dst++ = BLENDER(r0, r1, yf);
-      x += dx;
-      if (x > maxx)
-        x = maxx;
+    const uint8* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
     }
     dst_ptr += dst_stride;
     y += dy;
-    if (y > maxy)
-      y = maxy;
+    if (y > max_y) {
+      y = max_y;
+    }
   }
-}
+  free_aligned_buffer_64(row);
+}
+
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer.
+  align_buffer_64(row, src_width * 2);
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
 
-/**
- * Scale plane to/from any dimensions, with bilinear
- * interpolation.
- */
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr) {
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
-    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
-                             src_stride, dst_stride, src_ptr, dst_ptr);
 
-  } else {
-    SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
-    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
-                            ptrdiff_t src_stride,
-                            int dst_width, int source_y_fraction) =
-        ScaleFilterRows_C;
-#if defined(HAS_SCALEFILTERROWS_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ScaleFilterRows = ScaleFilterRows_NEON;
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
     }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale up down with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr,
+                          enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_C : ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
 #endif
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
-      ScaleFilterRows = ScaleFilterRows_SSE2;
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
     }
+  }
 #endif
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
-      ScaleFilterRows = ScaleFilterRows_SSSE3;
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
 #endif
 
-    int dx = (src_width << 16) / dst_width;
-    int dy = (src_height << 16) / dst_height;
-    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
-    int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-    int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
-    for (int j = 0; j < dst_height; ++j) {
-      int yi = y >> 16;
-      int yf = (y >> 8) & 255;
-      const uint8* src = src_ptr + yi * src_stride;
-      ScaleFilterRows(row, src, src_stride, src_width, yf);
-      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
       dst_ptr += dst_stride;
       y += dy;
-      if (y > maxy) {
-        y = maxy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr,
+                             enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
+
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_16_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 4);
+
+    uint16* rowptr = (uint16*)row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
       }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
     }
+    free_aligned_buffer_64(row);
   }
 }
 
-/**
- * Scale plane to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
 static void ScalePlaneSimple(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr) {
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-  for (int j = 0; j < dst_height; ++j) {
-    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
-    int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
-    uint8* dst = dst_ptr;
-    for (int i = 0; i < dst_width; ++i) {
-      *dst++ = src[x >> 16];
-      x += dx;
+  int i;
+  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_SSE2;
     }
+#endif
+  }
+
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
 }
 
-/**
- * Scale plane to/from any dimensions.
- */
-static void ScalePlaneAnySize(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_ptr, uint8* dst_ptr,
-                              FilterMode filtering) {
-  if (!filtering) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    // fall back to non-optimized version
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
+static void ScalePlaneSimple_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr) {
+  int i;
+  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_16_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
   }
-}
 
-/**
- * Scale plane down, any size
- *
- * This is an optimized version for scaling down a plane to any size.
- * The current implementation is ~10 times faster compared to the
- * reference implementation for e.g. XGA->LowResPAL
- *
- */
-static void ScalePlaneDown(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_ptr, uint8* dst_ptr,
-                           FilterMode filtering) {
-  if (!filtering) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
-    // between 1/2x and 1x use bilinear
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src_ptr, dst_ptr);
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+              dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;
   }
 }
 
 // Scale a plane.
-// This function in turn calls a scaling function suitable for handling
-// the desired resolutions.
+// This function dispatches to a specialized scaler based on scale factor.
 
 LIBYUV_API
 void ScalePlane(const uint8* src, int src_stride,
                 int src_width, int src_height,
                 uint8* dst, int dst_stride,
                 int dst_width, int dst_height,
-                FilterMode filtering) {
-#ifdef CPU_X86
-  // environment variable overrides for testing.
-  char *filter_override = getenv("LIBYUV_FILTER");
-  if (filter_override) {
-    filtering = (FilterMode)atoi(filter_override);  // NOLINT
+                enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
   }
-#endif
+
   // Use specialized scales to improve performance for common resolutions.
   // For example, all the 1/2 scalings will use ScalePlaneDown2()
   if (dst_width == src_width && dst_height == src_height) {
     // Straight copy.
     CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
-  } else if (dst_width <= src_width && dst_height <= src_height) {
+    return;
+  }
+  if (dst_width == src_width && filtering != kFilterBox) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       dst_width, dst_height,
+                       src_stride, dst_stride, src, dst,
+                       0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (use_reference_impl_) {
-      // For testing, allow the optimized versions to be disabled.
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == 3 * src_width &&
-               4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
-    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
       ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
     // 3/8 rounded up for odd sized chroma height.
-    } else if (8 * dst_width == 3 * src_width &&
-               dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
       // optimized, 3/8
       ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
-    } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+                  src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+                           src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                   src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                  int src_width, int src_height,
+                  uint16* dst, int dst_stride,
+                  int dst_width, int dst_height,
+                  enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled vertically.
+    ScalePlaneVertical_16(src_height,
+                          dst_width, dst_height,
+                          src_stride, dst_stride, src, dst,
+                          0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8 rounded up for odd sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
                filtering != kFilterBilinear) {
-      // optimized, 1/8
-      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-    } else {
-      // Arbitrary downsample
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
+      // optimized, 1/4
+      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
     }
-  } else {
-    // Arbitrary scale up and/or down.
-    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
   }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+                            src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+                              src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst);
 }
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
 
-#define UNDER_ALLOCATED_HACK 1
-
 LIBYUV_API
 int I420Scale(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -2925,48 +1554,16 @@ int I420Scale(const uint8* src_y, int src_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int dst_width, int dst_height,
-              FilterMode filtering) {
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    int halfheight = (src_height + 1) >> 1;
-    src_y = src_y + (src_height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-
-#ifdef UNDER_ALLOCATED_HACK
-  // If caller passed width / 2 for stride, adjust halfwidth to match.
-  if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
-    src_halfwidth = src_width >> 1;
-  }
-  if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
-    dst_halfwidth = dst_width >> 1;
-  }
-  // If caller used height / 2 when computing src_v, it will point into what
-  // should be the src_u plane. Detect this and reduce halfheight to match.
-  int uv_src_plane_size = src_halfwidth * src_halfheight;
-  if ((src_height & 1) &&
-      (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
-    src_halfheight = src_height >> 1;
-  }
-  int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
-  if ((dst_height & 1) &&
-      (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
-    dst_halfheight = dst_height >> 1;
-  }
-#endif
 
   ScalePlane(src_y, src_stride_y, src_width, src_height,
              dst_y, dst_stride_y, dst_width, dst_height,
@@ -2980,6 +1577,38 @@ int I420Scale(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+                dst_y, dst_stride_y, dst_width, dst_height,
+                filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+                filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+                filtering);
+  return 0;
+}
+
 // Deprecated api
 LIBYUV_API
 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
@@ -2988,90 +1617,53 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
           uint8* dst_y, uint8* dst_u, uint8* dst_v,
           int dst_stride_y, int dst_stride_u, int dst_stride_v,
           int dst_width, int dst_height,
-          bool interpolate) {
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    int halfheight = (src_height + 1) >> 1;
-    src_y = src_y + (src_height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
-
-#ifdef UNDER_ALLOCATED_HACK
-  // If caller passed width / 2 for stride, adjust halfwidth to match.
-  if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
-    src_halfwidth = src_width >> 1;
-  }
-  if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
-    dst_halfwidth = dst_width >> 1;
-  }
-  // If caller used height / 2 when computing src_v, it will point into what
-  // should be the src_u plane. Detect this and reduce halfheight to match.
-  int uv_src_plane_size = src_halfwidth * src_halfheight;
-  if ((src_height & 1) &&
-      (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
-    src_halfheight = src_height >> 1;
-  }
-  int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
-  if ((dst_height & 1) &&
-      (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
-    dst_halfheight = dst_height >> 1;
-  }
-#endif
-
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
-  return 0;
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y,
+                   src_u, src_stride_u,
+                   src_v, src_stride_v,
+                   src_width, src_height,
+                   dst_y, dst_stride_y,
+                   dst_u, dst_stride_u,
+                   dst_v, dst_stride_v,
+                   dst_width, dst_height,
+                   interpolate ? kFilterBox : kFilterNone);
 }
 
 // Deprecated api
 LIBYUV_API
 int ScaleOffset(const uint8* src, int src_width, int src_height,
                 uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                bool interpolate) {
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
-      dst_yoffset >= dst_height) {
-    return -1;
-  }
-  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-  int aheight = dst_height - dst_yoffset * 2;  // actual output height
+                LIBYUV_BOOL interpolate) {
+  // Chroma requires offset to multiple of 2.
+  int dst_yoffset_even = dst_yoffset & ~1;
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
   const uint8* src_y = src;
   const uint8* src_u = src + src_width * src_height;
   const uint8* src_v = src + src_width * src_height +
                              src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset * dst_width;
+  uint8* dst_y = dst + dst_yoffset_even * dst_width;
   uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset >> 1) * dst_halfwidth;
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
   uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset >> 1) * dst_halfwidth;
-  return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
-               src_width, src_height, dst_y, dst_u, dst_v, dst_width,
-               dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+      dst_yoffset_even >= dst_height) {
+    return -1;
+  }
+  return I420Scale(src_y, src_width,
+                   src_u, src_halfwidth,
+                   src_v, src_halfwidth,
+                   src_width, src_height,
+                   dst_y, dst_width,
+                   dst_u, dst_halfwidth,
+                   dst_v, dst_halfwidth,
+                   dst_width, aheight,
+                   interpolate ? kFilterBox : kFilterNone);
 }
 
 #ifdef __cplusplus
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
new file mode 100644
index 00000000..ed76a9e4
--- /dev/null
+++ b/files/source/scale_any.cc
@@ -0,0 +1,221 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 int dst_width, int x, int dx) {                               \
+      int n = dst_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
+      }                                                                        \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
+             dst_width & MASK, x + n * dx, dx);                                \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C, 4, 3)
+#endif
+#undef CANY
+
+// Fixed scale down.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+// Fixed scale down for odd source width.  Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
+      2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+      2, 1, 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
+      2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C, 2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+      4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
+                     src_stepx, dst_ptr + n * BPP, r);                         \
+    }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
+      int n = src_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
+      }                                                                        \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
+    }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index 5d4e1ac0..17f51ae9 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -12,1023 +12,847 @@
 
 #include <assert.h>
 #include <string.h>
-#include <stdlib.h>  // For getenv()
 
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"  // For CopyARGB
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
-
-// ARGB scaling uses bilinear or point, but not box filter.
-/**
- * SSE2 downscalers with bilinear interpolation.
- */
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-
-#define HAS_SCALEARGBROWDOWN2_SSE2
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
-                                   ptrdiff_t /* src_stride */,
-                                   uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    shufps     xmm0, xmm1, 0x88
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
 }
 
-// Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-
-    align      16
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
-    pavgb      xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        esi
-    ret
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
   }
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       ebx
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_ptr
-    mov        ecx, [esp + 8 + 20]   // dst_width
-    lea        ebx, [ebx * 4]
-    lea        edi, [ebx + ebx * 2]
 
-    align      16
-  wloop:
-    movd       xmm0, [eax]
-    movd       xmm1, [eax + ebx]
-    punpckldq  xmm0, xmm1
-    movd       xmm2, [eax + ebx * 2]
-    movd       xmm3, [eax + edi]
-    lea        eax,  [eax + ebx * 4]
-    punpckldq  xmm2, xmm3
-    punpcklqdq xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        edi
-    pop        ebx
-    ret
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
   }
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
-                                         ptrdiff_t src_stride,
-                                         int src_stepx,
-                                         uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]    // src_ptr
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_ptr
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]      // row1 pointer
-    lea        ebx, [ebx * 4]
-    lea        edi, [ebx + ebx * 2]
-
-    align      16
-  wloop:
-    movq       xmm0, qword ptr [eax] // row0 4 pairs
-    movhps     xmm0, qword ptr [eax + ebx]
-    movq       xmm1, qword ptr [eax + ebx * 2]
-    movhps     xmm1, qword ptr [eax + edi]
-    lea        eax,  [eax + ebx * 4]
-    movq       xmm2, qword ptr [esi] // row1 4 pairs
-    movhps     xmm2, qword ptr [esi + ebx]
-    movq       xmm3, qword ptr [esi + ebx * 2]
-    movhps     xmm3, qword ptr [esi + edi]
-    lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
-    pavgb      xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
 
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
   }
 }
 
-// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
-#ifndef SSE2_DISABLED
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
-__declspec(naked) __declspec(align(16))
-void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 128
-    je         xloop2
-
-    movd       xmm5, eax            // xmm5 = y fraction
-    punpcklbw  xmm5, xmm5
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-    pxor       xmm4, xmm4
-
-    // f * row1 + (1 - frac) row0
-    // frac * (row1 - row0) + row0
-    align      16
-  xloop:
-    movdqa     xmm0, [esi]  // row0
-    movdqa     xmm2, [esi + edx]  // row1
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    psubw      xmm2, xmm0  // row1 - row0
-    psubw      xmm3, xmm1
-    pmulhw     xmm2, xmm5  // scale diff
-    pmulhw     xmm3, xmm5
-    paddw      xmm0, xmm2  // sum rows
-    paddw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
-
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
 
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
   }
+  free_aligned_buffer_64(row);
 }
-#endif  // SSE2_DISABLED
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
-#define HAS_SCALEARGBFILTERROWS_SSSE3
-__declspec(naked) __declspec(align(16))
-void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                               ptrdiff_t src_stride, int dst_width,
-                               int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    shr        eax, 1
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 64
-    je         xloop2
-    movd       xmm0, eax  // high fraction 0..127
-    neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
-    punpcklbw  xmm5, xmm0
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-
-    align      16
-  xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
 
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
-
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down a ARGB to even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
 
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
   }
 }
 
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEARGBROWDOWN2_SSE2
-static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
-                                   ptrdiff_t /* src_stride */,
-                                   uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "lea       0x20(%0),%0                     \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;
+  int64 xl = (dx >= 0) ? x : xlast;
+  int64 xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  src_argb += xl * 4;
+  x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
 #endif
-  );
-}
-
-static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    (%0,%3,1),%%xmm2                \n"
-    "movdqa    0x10(%0,%3,1),%%xmm3            \n"
-    "lea       0x20(%0),%0                     \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
 #endif
-  );
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_ptr 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width) {
-  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
-  intptr_t src_stepx_x12 = 0;
-  asm volatile (
-    "lea       0x0(,%1,4),%1                   \n"
-    "lea       (%1,%1,2),%4                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movd      (%0),%%xmm0                     \n"
-    "movd      (%0,%1,1),%%xmm1                \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "movd      (%0,%1,2),%%xmm2                \n"
-    "movd      (%0,%4,1),%%xmm3                \n"
-    "lea       (%0,%1,4),%0                    \n"
-    "punpckldq %%xmm3,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(src_stepx_x4),  // %1
-    "+r"(dst_ptr),       // %2
-    "+r"(dst_width),     // %3
-    "+r"(src_stepx_x12)  // %4
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
 #endif
-  );
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_ptr 16 byte aligned.
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
-                                         ptrdiff_t src_stride, int src_stepx,
-                                         uint8* dst_ptr, int dst_width) {
-  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
-  intptr_t src_stepx_x12 = 0;
-  intptr_t row1 = static_cast<intptr_t>(src_stride);
-  asm volatile (
-    "lea       0x0(,%1,4),%1                   \n"
-    "lea       (%1,%1,2),%4                    \n"
-    "lea       (%0,%5,1),%5                    \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movq      (%0),%%xmm0                     \n"
-    "movhps    (%0,%1,1),%%xmm0                \n"
-    "movq      (%0,%1,2),%%xmm1                \n"
-    "movhps    (%0,%4,1),%%xmm1                \n"
-    "lea       (%0,%1,4),%0                    \n"
-    "movq      (%5),%%xmm2                     \n"
-    "movhps    (%5,%1,1),%%xmm2                \n"
-    "movq      (%5,%1,2),%%xmm3                \n"
-    "movhps    (%5,%4,1),%%xmm3                \n"
-    "lea       (%5,%1,4),%5                    \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),        // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_ptr),        // %2
-    "+rm"(dst_width),     // %3
-    "+r"(src_stepx_x12),  // %4
-    "+r"(row1)            // %5
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
 #endif
-  );
-}
-
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
-void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x80,%3                        \n"
-    "je        3f                              \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "movdqa    (%1,%4,1),%%xmm2                \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm2                   \n"
-    "psubw     %%xmm1,%%xmm3                   \n"
-    "pmulhw    %%xmm5,%%xmm2                   \n"
-    "pmulhw    %%xmm5,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "2:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
-    "shufps    $0xff,%%xmm0,%%xmm0             \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
 #endif
-  );
-}
-#endif  // SSE2_DISABLED
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-#define HAS_SCALEARGBFILTERROWS_SSSE3
-void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                               ptrdiff_t src_stride, int dst_width,
-                               int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "movdqa    (%1,%4,1),%%xmm2                \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "pmaddubsw %%xmm5,%%xmm0                   \n"
-    "pmaddubsw %%xmm5,%%xmm1                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        1b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "2:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-  "4:                                          \n"
-    ".p2align  4                               \n"
-    "shufps    $0xff,%%xmm0,%%xmm0             \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
 #endif
-  );
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  {
+    align_buffer_64(row, clip_src_width * 4);
+
+    const int max_y = (src_height - 1) << 16;
+    if (y > max_y) {
+      y = max_y;
+    }
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;
+      const uint8* src = src_argb + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
 }
-#endif  // defined(__x86_64__) || defined(__i386__)
 
-static void ScaleARGBRowDown2_C(const uint8* src_ptr,
-                                ptrdiff_t /* src_stride */,
-                                uint8* dst_ptr, int dst_width) {
-  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
-  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-
-  for (int x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[0];
-    dst[1] = src[2];
-    src += 4;
-    dst += 2;
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint8* src_argb, uint8* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
   }
-  if (dst_width & 1) {
-    dst[0] = src[0];
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
   }
-}
-
-static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width) {
-  for (int x = 0; x < dst_width; ++x) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
-                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
-    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
-                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
-    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
-                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
-    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
-    src_ptr += 8;
-    dst_ptr += 4;
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
   }
-}
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
-                            int src_stepx,
-                            uint8* dst_ptr, int dst_width) {
-  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
-  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-
-  for (int x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[0];
-    dst[1] = src[src_stepx];
-    src += src_stepx * 2;
-    dst += 2;
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
   }
-  if (dst_width & 1) {
-    dst[0] = src[0];
+#endif
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
-}
-
-static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      int src_stepx,
-                                      uint8* dst_ptr, int dst_width) {
-  for (int x = 0; x < dst_width; ++x) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
-                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
-    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
-                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
-    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
-                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
-    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
-    src_ptr += src_stepx * 4;
-    dst_ptr += 4;
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
   }
-}
 
-// (1-f)a + fb can be replaced with a + f(b-a)
+  if (y > max_y) {
+    y = max_y;
+  }
 
-#define BLENDER1(a, b, f) (static_cast<int>(a) + \
-    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+  {
+    int yi = y >> 16;
+    const uint8* src = src_argb + yi * src_stride;
 
-#define BLENDERC(a, b, f, s) static_cast<uint32>( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
 
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
 
-static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                                  int dst_width, int x, int dx) {
-  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
-  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-  for (int j = 0; j < dst_width - 1; j += 2) {
-    int xi = x >> 16;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src[xi];
-    b = src[xi + 1];
-    dst[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    int xi = x >> 16;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, x & 0xffff);
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
   }
 }
 
-static const int kMaxInputWidth = 2560;
-
-// C version 2x2 -> 2x1
-void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
-                           ptrdiff_t src_stride,
-                           int dst_width, int source_y_fraction) {
-  assert(dst_width > 0);
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  uint8* end = dst_ptr + (dst_width << 2);
-  do {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
-    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
-    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
-    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
-    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
-    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
-    src_ptr += 8;
-    src_ptr1 += 8;
-    dst_ptr += 8;
-  } while (dst_ptr < end);
-  // Duplicate the last pixel (4 bytes) for filtering.
-  dst_ptr[0] = dst_ptr[-4];
-  dst_ptr[1] = dst_ptr[-3];
-  dst_ptr[2] = dst_ptr[-2];
-  dst_ptr[3] = dst_ptr[-1];
-}
-
-/**
- * ScaleARGB ARGB, 1/2
- *
- * This is an optimized version for scaling down a ARGB to 1/2 of
- * its original size.
- *
- */
-static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_ptr, uint8* dst_ptr,
-                           FilterMode filtering) {
-  void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
-        ScaleARGBRowDown2_SSE2;
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8* src_y,
+                                     const uint8* src_u,
+                                     const uint8* src_v,
+                                     uint8* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering) {
+  int j;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
   }
 #endif
-
-  // TODO(fbarchard): Loop through source height to allow odd height.
-  for (int y = 0; y < dst_height; ++y) {
-    ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += (src_stride << 1);
-    dst_ptr += dst_stride;
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
   }
-}
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
 
-/**
- * ScaleARGB ARGB Even
- *
- * This is an optimized version for scaling down a ARGB to even
- * multiple of its original size.
- *
- */
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_ptr, uint8* dst_ptr,
-                              FilterMode filtering) {
-  assert(IS_ALIGNED(src_width, 2));
-  assert(IS_ALIGNED(src_height, 2));
-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
-                               int src_step, uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
-        ScaleARGBRowDownEven_SSE2;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
   }
 #endif
-  int src_step = src_width / dst_width;
-  // Adjust to point to center of box.
-  int row_step = src_height / dst_height;
-  int row_stride = row_step * src_stride;
-  src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
-  for (int y = 0; y < dst_height; ++y) {
-    ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
-    src_ptr += row_stride;
-    dst_ptr += dst_stride;
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
   }
-}
-/**
- * ScaleARGB ARGB to/from any dimensions, with bilinear
- * interpolation.
- */
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
 
-static void ScaleARGBBilinear(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_ptr, uint8* dst_ptr) {
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  assert(src_width <= kMaxInputWidth);
-  SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
-  void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride,
-                              int dst_width, int source_y_fraction) =
-      ScaleARGBFilterRows_C;
-#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
-    ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
-#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
-    ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
   }
 #endif
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
-  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
-  for (int j = 0; j < dst_height; ++j) {
-    int yi = y >> 16;
-    int yf = (y >> 8) & 255;
-    const uint8* src = src_ptr + yi * src_stride;
-    ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
-    ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
-    dst_ptr += dst_stride;
-    y += dy;
-    if (y > maxy) {
-      y = maxy;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
     }
   }
-}
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
 
-// Scales a single row of pixels using point sampling.
-// Code is adapted from libyuv bilinear yuv scaling, but with bilinear
-//     interpolation off, and argb pixels instead of yuv.
-static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
-  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
-  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-  for (int j = 0; j < dst_width - 1; j += 2) {
-    dst[0] = src[x >> 16];
-    x += dx;
-    dst[1] = src[x >> 16];
-    x += dx;
-    dst += 2;
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;
+  int rowstride = kRowSize;
+  int lasty = yi;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
   }
-  if (dst_width & 1) {
-    dst[0] = src[x >> 16];
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
   }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(row_argb);
 }
+#endif
 
-/**
- * ScaleARGB ARGB to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
 
 static void ScaleARGBSimple(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr) {
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
-  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
-  for (int i = 0; i < dst_height; ++i) {
-    ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
-    dst_ptr += dst_stride;
-    y += dy;
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
   }
-}
 
-/**
- * ScaleARGB ARGB to/from any dimensions.
- */
-static void ScaleARGBAnySize(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             FilterMode filtering) {
-  if (!filtering || (src_width > kMaxInputWidth)) {
-    ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
-                    src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src_ptr, dst_ptr);
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;
   }
 }
 
 // ScaleARGB a ARGB.
-//
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
-
 static void ScaleARGB(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
-                      FilterMode filtering) {
-#ifdef CPU_X86
-  // environment variable overrides for testing.
-  char *filter_override = getenv("LIBYUV_FILTER");
-  if (filter_override) {
-    filtering = (FilterMode)atoi(filter_override);  // NOLINT
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
   }
-#endif
-  if (dst_width == src_width && dst_height == src_height) {
-    // Straight copy.
-    ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
-    return;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    int64 clipf = (int64)(clip_x) * dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 4;
+    dst += clip_x * 4;
   }
-  if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-    // Optimized 1/2.
-    ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst, filtering);
-    return;
+  if (clip_y) {
+    int64 clipf = (int64)(clip_y) * dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
   }
-  int scale_down_x = src_width / dst_width;
-  int scale_down_y = src_height / dst_height;
-  if (dst_width * scale_down_x == src_width &&
-      dst_height * scale_down_y == src_height) {
-    if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
-      // Optimized even scale down. ie 4, 6, 8, 10x
-      ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
-                        src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    if ((scale_down_x & 1) && (scale_down_y & 1)) {
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
       filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
     }
   }
-  // Arbitrary scale up and/or down.
-  ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst, filtering);
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled vertically.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
 }
 
-// ScaleARGB an ARGB image.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+      clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
+      (clip_x + clip_width) > dst_width ||
+      (clip_y + clip_height) > dst_height) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
+
+// Scale an ARGB image.
 LIBYUV_API
 int ARGBScale(const uint8* src_argb, int src_stride_argb,
-             int src_width, int src_height,
-             uint8* dst_argb, int dst_stride_argb,
-             int dst_width, int dst_height,
-             FilterMode filtering) {
-  if (!src_argb || src_width <= 0 || src_height == 0 ||
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
       !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    src_argb = src_argb + (src_height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
   ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
             dst_argb, dst_stride_argb, dst_width, dst_height,
-            filtering);
+            0, 0, dst_width, dst_height, filtering);
   return 0;
 }
 
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering) {
+  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+  int r;
+  I420ToARGB(src_y, src_stride_y,
+             src_u, src_stride_u,
+             src_v, src_stride_v,
+             argb_buffer, src_width * 4,
+             src_width, src_height);
+
+  r = ARGBScaleClip(argb_buffer, src_width * 4,
+                    src_width, src_height,
+                    dst_argb, dst_stride_argb,
+                    dst_width, dst_height,
+                    clip_x, clip_y, clip_width, clip_height,
+                    filtering);
+  free(argb_buffer);
+  return r;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
new file mode 100644
index 00000000..3507aa4d
--- /dev/null
+++ b/files/source/scale_common.cc
@@ -0,0 +1,1159 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  dst_width -= 1;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst += 1;
+    s += 2;
+    t += 2;
+  }
+  dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[2];
+    dst[1] = src_ptr[6];
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[2];
+  }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[2];
+    dst[1] = src_ptr[6];
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[2];
+  }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+             src_ptr[stride + 4] + src_ptr[stride + 5] +
+             src_ptr[stride + 6] + src_ptr[stride + 7] +
+             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+             8) >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+  }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+             src_ptr[stride + 4] + src_ptr[stride + 5] +
+             src_ptr[stride + 6] + src_ptr[stride + 7] +
+             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+             8) >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+  }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[0] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr[1] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[x >> 16];
+  }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[0] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr[1] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[x >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+    src_ptr += 1;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[0];
+  }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+    src_ptr += 1;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[0];
+  }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#if defined(__arm__) || defined(__aarch64__)
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// inteluses 7 bit math with rounding.
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+#undef BLENDER
+
+// Same as 8 bit arm blender but return is cast to uint16
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  int x;
+  assert(dst_width % 3 == 0);
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert(dst_width % 3 == 0);
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7] +
+        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7] +
+        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  int x;
+  assert(src_width > 0);
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
+  }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+  int x;
+  assert(src_width > 0);
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
+  }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[1];
+    dst[1] = src[3];
+    src += 4;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[1];
+  }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] +
+                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] +
+                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] +
+                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] +
+                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[0];
+    dst[1] = src[src_stepx];
+    src += src_stepx * 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] +
+                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] +
+                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] +
+                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] +
+                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    src_argb += src_stepx * 4;
+    dst_argb += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[1] = dst[0] = src[0];
+    src += 1;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) (uint32)( \
+    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher bpp.
+  int dst_width_bytes = dst_width * bpp;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(bpp >= 1 && bpp <= 4);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(dst_width_bytes, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_bytes, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(dst_width_bytes, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  if (src_width < 0) {
+    src_width = -src_width;
+  }
+  if (src_height < 0) {
+    src_height = -src_height;
+  }
+  if (filtering == kFilterBox) {
+    // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+      filtering = kFilterBilinear;
+    }
+  }
+  if (filtering == kFilterBilinear) {
+    if (src_height == 1) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (dst_height == src_height || dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (dst_width == src_width || dst_width * 3 == src_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+  return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+  return (int)((((int64)(num) << 16) - 0x00010001) /
+                          (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy) {
+  assert(x != NULL);
+  assert(y != NULL);
+  assert(dx != NULL);
+  assert(dy != NULL);
+  assert(src_width != 0);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  // Check for 1 pixel and avoid FixedDiv overflow.
+  if (dst_width == 1 && src_width >= 32768) {
+    dst_width = src_width;
+  }
+  if (dst_height == 1 && src_height >= 32768) {
+    dst_height = src_height;
+  }
+  if (filtering == kFilterBox) {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = 0;
+    *y = 0;
+  } else if (filtering == kFilterBilinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    if (dst_height <= src_height) {
+      *dy = FixedDiv(src_height,  dst_height);
+      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_height > 1) {
+      *dy = FixedDiv1(src_height, dst_height);
+      *y = 0;
+    }
+  } else if (filtering == kFilterLinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    *dy = FixedDiv(src_height, dst_height);
+    *y = *dy >> 1;
+  } else {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = CENTERSTART(*dx, 0);
+    *y = CENTERSTART(*dy, 0);
+  }
+  // Negative src_width means horizontally mirror.
+  if (src_width < 0) {
+    *x += (dst_width - 1) * *dx;
+    *dx = -*dx;
+    // src_width = -src_width;   // Caller must do this.
+  }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
new file mode 100644
index 00000000..e2f88544
--- /dev/null
+++ b/files/source/scale_gcc.cc
@@ -0,0 +1,1322 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrlw      $0xf,%%xmm4                    \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"
+    "pxor       %%xmm5,%%xmm5                  \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pavgw      %%xmm5,%%xmm0                  \n"
+    "pavgw      %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrlw      $0xf,%%xmm4                    \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"
+    "pxor       %%xmm5,%%xmm5                  \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"
+    "paddw      %%xmm3,%%xmm1                  \n"
+    "psrlw      $0x1,%%xmm0                    \n"
+    "psrlw      $0x1,%%xmm1                    \n"
+    "pavgw      %%xmm5,%%xmm0                  \n"
+    "pavgw      %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x18,%%xmm5                    \n"
+    "pslld     $0x10,%%xmm5                    \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  intptr_t stridex3;
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrlw      $0xf,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                  \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"
+    "psllw      $0x3,%%xmm5                    \n"
+    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"
+    "paddw      %%xmm3,%%xmm1                  \n"
+    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"
+    "paddw      %%xmm3,%%xmm1                  \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"
+    "paddw      %%xmm3,%%xmm1                  \n"
+    "phaddw     %%xmm1,%%xmm0                  \n"
+    "paddw      %%xmm5,%%xmm0                  \n"
+    "psrlw      $0x4,%%xmm0                    \n"
+    "packuswb   %%xmm0,%%xmm0                  \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "=&r"(stridex3)    // %3
+  : "r"((intptr_t)(src_stride))    // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
+    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "r"((intptr_t)(src_stride * 3))   // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm3                       \n"
+    "movdqa    %1,%%xmm4                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kShuf0),  // %0
+    "m"(kShuf1),  // %1
+    "m"(kShuf2)   // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "palignr   $0x8,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm3,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "m"(kMadd21)     // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "m"(kMadd21)     // %4
+    : "memory", "cc", NACL_R14
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "movhlps   %%xmm0,%%xmm1                   \n"
+    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "lea       " MEMLEA(0xc,1) ",%1            \n"
+    "sub       $0xc,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "m"(kShuf38a),   // %3
+    "m"(kShuf38b)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm4                       \n"
+    "movdqa    %3,%%xmm5                       \n"
+  :
+  : "m"(kShufAb0),   // %0
+    "m"(kShufAb1),   // %1
+    "m"(kShufAb2),   // %2
+    "m"(kScaleAb2)   // %3
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pshufb    %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "paddusw   %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "paddusw   %%xmm0,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movd      %%xmm1," MEMACCESS(1) "         \n"
+    "psrlq     $0x10,%%xmm1                    \n"
+    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
+    "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "sub       $0x6,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm4                       \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+  :
+  : "m"(kShufAc),    // %0
+    "m"(kShufAc3),   // %1
+    "m"(kScaleAc33)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
+    "movhlps   %%xmm0,%%xmm1                   \n"
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"
+    "paddusw   %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"
+    "paddusw   %%xmm7,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "pshufb    %%xmm3,%%xmm7                   \n"
+    "paddusw   %%xmm7,%%xmm6                   \n"
+    "pmulhuw   %%xmm4,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movd      %%xmm6," MEMACCESS(1) "         \n"
+    "psrlq     $0x10,%%xmm6                    \n"
+    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
+    "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "sub       $0x6,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpckhbw %%xmm5,%%xmm3                   \n"
+    "paddusw   %%xmm2,%%xmm0                   \n"
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(src_width)    // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
+    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
+    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(src_width)    // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 =
+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 =
+  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  intptr_t x0, x1, temp_pixel;
+  asm volatile (
+    "movd      %6,%%xmm2                       \n"
+    "movd      %7,%%xmm3                       \n"
+    "movl      $0x04040000,%k2                 \n"
+    "movd      %k2,%%xmm5                      \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $15,%%xmm7                      \n"  // 0x00010001
+
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "subl      $0x2,%5                         \n"
+    "jl        29f                             \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "punpckldq %%xmm0,%%xmm2                   \n"
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm1                     \n"
+    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
+    "movd      %k2,%%xmm4                      \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "punpcklwd %%xmm4,%%xmm0                   \n"
+    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+    "pxor      %%xmm6,%%xmm1                   \n"  // 128 -f = (f ^ 127 ) + 1
+    "paddusb   %%xmm7,%%xmm1                   \n"
+    "pmaddubsw %%xmm0,%%xmm1                   \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
+    "psrlw     $0x7,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movd      %%xmm1,%k2                      \n"
+    "mov       %w2," MEMACCESS(0) "            \n"
+    "lea       " MEMLEA(0x2,0) ",%0            \n"
+    "subl      $0x2,%5                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "addl      $0x1,%5                         \n"
+    "jl        99f                             \n"
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "paddusb   %%xmm7,%%xmm2                   \n"
+    "pmaddubsw %%xmm0,%%xmm2                   \n"
+    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm2                   \n"
+    "movd      %%xmm2,%k2                      \n"
+    "mov       %b2," MEMACCESS(0) "            \n"
+  "99:                                         \n"
+  : "+r"(dst_ptr),      // %0
+    "+r"(src_ptr),      // %1
+    "=&a"(temp_pixel),  // %2
+    "=&r"(x0),          // %3
+    "=&r"(x1),          // %4
+#if defined(__x86_64__)
+    "+rm"(dst_width)    // %5
+#else
+    "+m"(dst_width)    // %5
+#endif
+  : "rm"(x),            // %6
+    "rm"(dx),           // %7
+#if defined(__x86_64__)
+    "x"(kFsub80),       // %8
+    "x"(kFadd40)        // %9
+#else
+    "m"(kFsub80),       // %8
+    "m"(kFadd40)        // %9
+#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x20,%2                         \n"
+    "jg        1b                              \n"
+
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12;
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movd      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
+    "punpckldq %%xmm3,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm0                  \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+r"(dst_width),      // %3
+    "=&r"(src_stepx_x12)  // %4
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride, int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12;
+  intptr_t row1 = (intptr_t)(src_stride);
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
+    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
+    "movq      " MEMACCESS(5) ",%%xmm2         \n"
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),        // %0
+    "+r"(src_stepx_x4),    // %1
+    "+r"(dst_argb),        // %2
+    "+rm"(dst_width),      // %3
+    "=&r"(src_stepx_x12),  // %4
+    "+r"(row1)             // %5
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  intptr_t x0, x1;
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"
+    "movd      %6,%%xmm3                       \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pextrw    $0x1,%%xmm2,%k0                 \n"
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "cmp       $0x0,%4                         \n"
+    "jl        99f                             \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"
+
+    LABELALIGN
+  "40:                                         \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"
+    "pextrw    $0x7,%%xmm2,%k1                 \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
+    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
+    "pextrw    $0x1,%%xmm2,%k0                 \n"
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "punpckldq %%xmm4,%%xmm1                   \n"
+    "punpcklqdq %%xmm1,%%xmm0                  \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "test      $0x2,%4                         \n"
+    "je        29f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x8,2) ",%2            \n"
+  "29:                                         \n"
+    "test      $0x1,%4                         \n"
+    "je        99f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+  "99:                                         \n"
+  : "=&a"(x0),         // %0
+    "=&d"(x1),         // %1
+    "+r"(dst_argb),    // %2
+    "+r"(src_argb),    // %3
+    "+r"(dst_width)    // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpckldq %%xmm0,%%xmm0                   \n"
+    "punpckhdq %%xmm1,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  intptr_t x0, x1;
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm5                       \n"
+  :
+  : "m"(kShuffleColARGB),  // %0
+    "m"(kShuffleFractions)  // %1
+  );
+
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"
+    "movd      %6,%%xmm3                       \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "sub       $0x2,%2                         \n"
+    "jl        29f                             \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "punpckldq %%xmm0,%%xmm2                   \n"
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "psrlw     $0x9,%%xmm1                     \n"
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm1                   \n"
+    "pmaddubsw %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(0) "         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x2,%2                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "add       $0x1,%2                         \n"
+    "jl        99f                             \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(0) "         \n"
+
+    LABELALIGN
+  "99:                                         \n"
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+rm"(dst_width),  // %2
+    "=&r"(x0),         // %3
+    "=&r"(x1)          // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "sub       $0x10001,%%eax                  \n"
+    "sbb       $0x0,%%edx                      \n"
+    "sub       $0x1,%1                         \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/scale_mips.cc b/files/source/scale_mips.cc
new file mode 100644
index 00000000..ae953073
--- /dev/null
+++ b/files/source/scale_mips.cc
@@ -0,0 +1,644 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width) {
+  __asm__ __volatile__(
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+
+    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
+    "beqz           $t9, 2f                        \n"
+    " nop                                          \n"
+
+  "1:                                              \n"
+    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
+    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
+    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
+    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
+    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
+    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
+    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
+    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
+    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
+    "addiu          %[src_ptr], %[src_ptr], 32     \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sw             $t8, 0(%[dst])                 \n"
+    "sw             $t0, 4(%[dst])                 \n"
+    "sw             $t1, 8(%[dst])                 \n"
+    "sw             $t2, 12(%[dst])                \n"
+    "bgtz           $t9, 1b                        \n"
+    " addiu         %[dst], %[dst], 16             \n"
+
+  "2:                                              \n"
+    "andi           $t9, %[dst_width], 0xf         \n"  // residue
+    "beqz           $t9, 3f                        \n"
+    " nop                                          \n"
+
+  "21:                                             \n"
+    "lbu            $t0, 0(%[src_ptr])             \n"
+    "addiu          %[src_ptr], %[src_ptr], 2      \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sb             $t0, 0(%[dst])                 \n"
+    "bgtz           $t9, 21b                       \n"
+    " addiu         %[dst], %[dst], 1              \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  const uint8* t = src_ptr + src_stride;
+
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
+    "bltz           $t9, 2f                       \n"
+    " nop                                         \n"
+
+  "1:                                             \n"
+    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
+    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
+    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
+    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
+    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
+    "addiu          $t9, $t9, -1                  \n"
+    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
+    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
+    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
+    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
+    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
+    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
+    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
+    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
+    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
+    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
+    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
+    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
+    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
+    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
+    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
+    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
+    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
+    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
+    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
+    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
+    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
+    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
+    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
+    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
+    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
+    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
+    "addiu          %[src_ptr], %[src_ptr], 16    \n"
+    "addiu          %[t], %[t], 16                \n"
+    "sb             $t0, 0(%[dst])                \n"
+    "sb             $t4, 1(%[dst])                \n"
+    "sb             $t1, 2(%[dst])                \n"
+    "sb             $t5, 3(%[dst])                \n"
+    "sb             $t2, 4(%[dst])                \n"
+    "sb             $t6, 5(%[dst])                \n"
+    "sb             $t3, 6(%[dst])                \n"
+    "sb             $t7, 7(%[dst])                \n"
+    "bgtz           $t9, 1b                       \n"
+    " addiu         %[dst], %[dst], 8             \n"
+
+  "2:                                             \n"
+    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
+    "beqz           $t9, 3f                       \n"
+    " nop                                         \n"
+
+    "21:                                          \n"
+    "lwr            $t1, 0(%[src_ptr])            \n"
+    "lwl            $t1, 3(%[src_ptr])            \n"
+    "lwr            $t2, 0(%[t])                  \n"
+    "lwl            $t2, 3(%[t])                  \n"
+    "srl            $t8, $t1, 16                  \n"
+    "ins            $t1, $t2, 16, 16              \n"
+    "ins            $t2, $t8, 0, 16               \n"
+    "raddu.w.qb     $t1, $t1                      \n"
+    "raddu.w.qb     $t2, $t2                      \n"
+    "shra_r.w       $t1, $t1, 2                   \n"
+    "shra_r.w       $t2, $t2, 2                   \n"
+    "sb             $t1, 0(%[dst])                \n"
+    "sb             $t2, 1(%[dst])                \n"
+    "addiu          %[src_ptr], %[src_ptr], 4     \n"
+    "addiu          $t9, $t9, -2                  \n"
+    "addiu          %[t], %[t], 4                 \n"
+    "bgtz           $t9, 21b                      \n"
+    " addiu         %[dst], %[dst], 2             \n"
+
+  "3:                                             \n"
+    ".set pop                                     \n"
+
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst), [t] "+r" (t)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                    \n"
+      ".set noreorder                               \n"
+
+      "srl            $t9, %[dst_width], 3          \n"
+      "beqz           $t9, 2f                       \n"
+      " nop                                         \n"
+
+     "1:                                            \n"
+      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
+      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
+      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
+      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
+      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
+      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
+      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
+      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
+      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
+      "addiu          %[src_ptr], %[src_ptr], 32    \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sw             $t1, 0(%[dst])                \n"
+      "sw             $t5, 4(%[dst])                \n"
+      "bgtz           $t9, 1b                       \n"
+      " addiu         %[dst], %[dst], 8             \n"
+
+    "2:                                             \n"
+      "andi           $t9, %[dst_width], 7          \n"  // residue
+      "beqz           $t9, 3f                       \n"
+      " nop                                         \n"
+
+    "21:                                            \n"
+      "lbu            $t1, 0(%[src_ptr])            \n"
+      "addiu          %[src_ptr], %[src_ptr], 4     \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sb             $t1, 0(%[dst])                \n"
+      "bgtz           $t9, 21b                      \n"
+      " addiu         %[dst], %[dst], 1             \n"
+
+    "3:                                             \n"
+      ".set pop                                     \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst)
+      : [dst_width] "r" (dst_width)
+      : "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;
+  const uint8* s2 = s1 + stride;
+  const uint8* s3 = s2 + stride;
+
+  __asm__ __volatile__ (
+      ".set push                                  \n"
+      ".set noreorder                             \n"
+
+      "srl           $t9, %[dst_width], 1         \n"
+      "andi          $t8, %[dst_width], 1         \n"
+
+     "1:                                          \n"
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
+      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
+      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
+      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
+      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
+      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"
+      "add           $t4, $t4, $t5                \n"
+      "add           $t6, $t6, $t7                \n"
+      "add           $t4, $t4, $t6                \n"
+      "shra_r.w      $t0, $t0, 4                  \n"
+      "shra_r.w      $t4, $t4, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+      "sb            $t4, 1(%[dst])               \n"
+      "addiu         %[src_ptr], %[src_ptr], 8    \n"
+      "addiu         %[s1], %[s1], 8              \n"
+      "addiu         %[s2], %[s2], 8              \n"
+      "addiu         %[s3], %[s3], 8              \n"
+      "addiu         $t9, $t9, -1                 \n"
+      "bgtz          $t9, 1b                      \n"
+      " addiu        %[dst], %[dst], 2            \n"
+      "beqz          $t8, 2f                      \n"
+      " nop                                       \n"
+
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"
+      "shra_r.w      $t0, $t0, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+
+      "2:                                         \n"
+      ".set pop                                   \n"
+
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [s3] "+r" (s3)
+      : [dst_width] "r" (dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                          \n"
+      ".set noreorder                                     \n"
+    "1:                                                   \n"
+      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
+      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
+      "addiu           %[dst_width], %[dst_width], -24    \n"
+      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
+      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
+      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
+      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
+      "addiu           %[src_ptr], %[src_ptr], 32         \n"
+      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
+      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
+      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
+      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
+      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
+      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
+      "sw              $t1, 0(%[dst])                     \n"
+      "sw              $t0, 4(%[dst])                     \n"
+      "sw              $t3, 8(%[dst])                     \n"
+      "sw              $t5, 12(%[dst])                    \n"
+      "sw              $t9, 16(%[dst])                    \n"
+      "sw              $t7, 20(%[dst])                    \n"
+      "bnez            %[dst_width], 1b                   \n"
+      " addiu          %[dst], %[dst], 24                 \n"
+      ".set pop                                           \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "repl.ph           $t3, 3                          \n"  // 0x00030003
+
+    "1:                                                  \n"
+      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
+      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
+      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                        \n"
+      "raddu.w.qb        $t1, $t1                        \n"
+      "shra_r.w          $t0, $t0, 1                     \n"
+      "shra_r.w          $t1, $t1, 1                     \n"
+      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
+      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
+      "addu.ph           $t2, $t2, $t4                   \n"
+      "addu.ph           $t6, $t6, $t5                   \n"
+      "sll               $t5, $t0, 1                     \n"
+      "add               $t0, $t5, $t0                   \n"
+      "shra_r.ph         $t2, $t2, 2                     \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "shll.ph           $t4, $t2, 1                     \n"
+      "addq.ph           $t4, $t4, $t2                   \n"
+      "addu              $t0, $t0, $t1                   \n"
+      "addiu             %[src_ptr], %[src_ptr], 4       \n"
+      "shra_r.w          $t0, $t0, 2                     \n"
+      "addu.ph           $t6, $t6, $t4                   \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "srl               $t1, $t6, 16                    \n"
+      "addiu             %[dst_width], %[dst_width], -3  \n"
+      "sb                $t1, 0(%[d])                    \n"
+      "sb                $t0, 1(%[d])                    \n"
+      "sb                $t6, 2(%[d])                    \n"
+      "bgtz              %[dst_width], 1b                \n"
+      " addiu            %[d], %[d], 3                   \n"
+    "3:                                                  \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                           \n"
+      ".set noreorder                                      \n"
+      "repl.ph           $t2, 3                            \n"  // 0x00030003
+
+    "1:                                                    \n"
+      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
+      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
+      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                          \n"
+      "raddu.w.qb        $t1, $t1                          \n"
+      "shra_r.w          $t0, $t0, 1                       \n"
+      "shra_r.w          $t1, $t1, 1                       \n"
+      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
+      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
+      "addu.ph           $t4, $t4, $t3                     \n"
+      "addu.ph           $t6, $t6, $t5                     \n"
+      "shra_r.ph         $t6, $t6, 2                       \n"
+      "shra_r.ph         $t4, $t4, 2                       \n"
+      "addu.ph           $t6, $t6, $t4                     \n"
+      "addiu             %[src_ptr], %[src_ptr], 4         \n"
+      "shra_r.ph         $t6, $t6, 1                       \n"
+      "addu              $t0, $t0, $t1                     \n"
+      "addiu             %[dst_width], %[dst_width], -3    \n"
+      "shra_r.w          $t0, $t0, 1                       \n"
+      "srl               $t1, $t6, 16                      \n"
+      "sb                $t1, 0(%[d])                      \n"
+      "sb                $t0, 1(%[d])                      \n"
+      "sb                $t6, 2(%[d])                      \n"
+      "bgtz              %[dst_width], 1b                  \n"
+      " addiu            %[d], %[d], 3                     \n"
+    "3:                                                    \n"
+      ".set pop                                            \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                     \n"
+      ".set noreorder                                \n"
+
+    "1:                                              \n"
+      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
+      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
+      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
+      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
+      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
+      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
+      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
+      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
+      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
+      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
+      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
+      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
+      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
+      "addiu      %[src_ptr], %[src_ptr], 32         \n"
+      "addiu      %[dst_width], %[dst_width], -12    \n"
+      "addiu      $t8,%[dst_width], -12              \n"
+      "sw         $t1, 0(%[dst])                     \n"
+      "sw         $t4, 4(%[dst])                     \n"
+      "sw         $t6, 8(%[dst])                     \n"
+      "bgez       $t8, 1b                            \n"
+      " addiu     %[dst], %[dst], 12                 \n"
+      ".set pop                                      \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* t = src_ptr + stride;
+  const int c = 0x2AAA;
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
+      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
+      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
+      "srl             $t4, $t4, 2                       \n"  // t4 / 4
+      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
+      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
+      "addu            $t6, $t5, $t6                     \n"
+      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
+      "addu            $t0, $t0, $t2                     \n"
+      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[t], %[t], 8                     \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t4, -1(%[dst_ptr])               \n"
+      "sb              $t6, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [t] "+r" (t),
+        [dst_width] "+r" (dst_width)
+      : [c] "r" (c)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;
+  stride += stride;
+  const uint8* s2 = src_ptr + stride;
+  const int c1 = 0x1C71;
+  const int c2 = 0x2AAA;
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
+      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
+      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
+      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
+      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
+      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
+      "addu            $t7, $t7, $t8                     \n"
+      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
+      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
+      "addu            $t6, $t6, $t8                     \n"
+      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
+      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
+      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
+      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
+      "addu            $t7, $t7, $t8                     \n"
+      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
+      "raddu.w.qb      $t0, $t0                          \n"
+      "raddu.w.qb      $t2, $t2                          \n"
+      "raddu.w.qb      $t4, $t4                          \n"
+      "addu            $t0, $t0, $t2                     \n"
+      "addu            $t0, $t0, $t4                     \n"
+      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[s1], %[s1], 8                   \n"
+      "addiu           %[s2], %[s2], 8                   \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t7, $t7, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t6, -1(%[dst_ptr])               \n"
+      "sb              $t7, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [dst_width] "+r" (dst_width)
+      : [c1] "r" (c1), [c2] "r" (c2)
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index a1946f05..44b0c808 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -4,11 +4,10 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/basic_types.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -16,88 +15,120 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for GCC Neon
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
 
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width) {
   asm volatile (
-    "1:                                        \n"
+  "1:                                          \n"
     // load even pixels into q0, odd into q1
-    "vld2.u8    {q0,q1}, [%0]!                 \n"
-    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
     "bgt        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst),              // %1
-      "+r"(dst_width)         // %2
-    :
-    : "q0", "q1"              // Clobber List
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"              // Clobber List
   );
 }
 
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"     // Clobber List
+  );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
     "add        %1, %0                         \n"
-    "1:                                        \n"
-    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post inc
-    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post inc
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
     "vpaddl.u8  q1, q1                         \n"
     "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
     "vpadal.u8  q1, q3                         \n"
     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     "vrshrn.u16 d1, q1, #2                     \n"
-    "vst1.u8    {q0}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(src_stride),       // %1
-      "+r"(dst),              // %2
-      "+r"(dst_width)         // %3
-    :
-    : "q0", "q1", "q2", "q3"     // Clobber List
-   );
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "q0", "q1", "q2", "q3"     // Clobber List
+  );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "1:                                        \n"
-    "vld2.u8    {d0, d1}, [%0]!                \n"
-    "vtrn.u8    d1, d0                         \n"
-    "vshrn.u16  d0, q0, #8                     \n"
-    "vst1.u32   {d0[1]}, [%1]!                 \n"
-    "subs       %2, #4                         \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d2}, [%1]!                    \n"
     "bgt        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    :
-    : "q0", "q1", "memory", "cc"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"
   );
 }
 
-void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "add        r4, %0, %3                     \n"
-    "add        r5, r4, %3                     \n"
-    "add        %3, r5, %3                     \n"
-    "1:                                        \n"
-    "vld1.u8    {q0}, [%0]!                    \n"   // load up 16x4
-    "vld1.u8    {q1}, [r4]!                    \n"
-    "vld1.u8    {q2}, [r5]!                    \n"
-    "vld1.u8    {q3}, [%3]!                    \n"
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
+    "subs       %2, %2, #4                     \n"
     "vpaddl.u8  q0, q0                         \n"
     "vpadal.u8  q0, q1                         \n"
     "vpadal.u8  q0, q2                         \n"
@@ -105,14 +136,17 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "vpaddl.u16 q0, q0                         \n"
     "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
     "vmovn.u16  d0, q0                         \n"
-    "vst1.u32   {d0[0]}, [%1]!                 \n"
-    "subs       %2, #4                         \n"
+    MEMACCESS(1)
+    "vst1.32    {d0[0]}, [%1]!                 \n"
     "bgt        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    : "r"(src_stride)         // %3
-    : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
   );
 }
 
@@ -120,32 +154,37 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
 void ScaleRowDown34_NEON(const uint8* src_ptr,
-                         ptrdiff_t /* src_stride */,
+                         ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vmov         d2, d3                       \n" // order d0, d1, d2
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
-    "subs         %2, #24                      \n"
-    "bgt          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    :
-    : "d0", "d1", "d2", "d3", "memory", "cc"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n"
+    "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
+    "vst3.8     {d0, d1, d2}, [%1]!          \n"
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"
   );
 }
 
-void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "vmov.u8      d24, #3                      \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
 
     // filter src line 0 with src line 1
     // expand chars to shorts to allow for room
@@ -180,29 +219,31 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
     "vmlal.u8     q8, d3, d24                  \n"
     "vqrshrn.u16  d2, q8, #2                   \n"
 
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
 
-    "subs         %2, #24                      \n"
     "bgt          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    :
-    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
   );
 }
 
-void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "vmov.u8      d24, #3                      \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
     // average src line 0 with src line 1
     "vrhadd.u8    q0, q0, q2                   \n"
     "vrhadd.u8    q1, q1, q3                   \n"
@@ -220,72 +261,83 @@ void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
     "vmlal.u8     q3, d3, d24                  \n"
     "vqrshrn.u16  d2, q3, #2                   \n"
 
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
-
-    "subs         %2, #24                      \n"
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
     "bgt          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    :
-    : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
   );
 }
 
 #define HAS_SCALEROWDOWN38_NEON
-const uvec8 kShuf38 =
+static uvec8 kShuf38 =
   { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-const uvec8 kShuf38_2 =
+static uvec8 kShuf38_2 =
   { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-const vec16 kMult38_Div6 =
+static vec16 kMult38_Div6 =
   { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
     65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-const vec16 kMult38_Div9 =
+static vec16 kMult38_Div9 =
   { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
     65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
 
 // 32 -> 12
 void ScaleRowDown38_NEON(const uint8* src_ptr,
-                         ptrdiff_t /* src_stride */,
+                         ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "vld1.u8      {q3}, [%3]                   \n"
-    "1:                                        \n"
-    "vld1.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
-    "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
-    "vst1.u8      {d4}, [%1]!                  \n"
-    "vst1.u32     {d5[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
-    "bgt          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    : "r"(&kShuf38)           // %3
-    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+    MEMACCESS(3)
+    "vld1.8     {q3}, [%3]                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+    "subs       %2, %2, #12                    \n"
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
+    "vst1.32    {d5[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
   );
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
   asm volatile (
-    "vld1.u16     {q13}, [%4]                  \n"
-    "vld1.u8      {q14}, [%5]                  \n"
-    "vld1.u8      {q15}, [%6]                  \n"
-    "add          r4, %0, %3, lsl #1           \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
+    "add        %3, %0                         \n"
+  "1:                                          \n"
 
     // d0 = 00 40 01 41 02 42 03 43
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
-    "vld4.u8      {d16, d17, d18, d19}, [r4]!  \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+    "subs         %2, %2, #12                  \n"
 
     // Shuffle the input data around to get align the data
     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -361,38 +413,44 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    "vst1.u8      {d3}, [%1]!                  \n"
-    "vst1.u32     {d4[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    : "r"(&kMult38_Div6),     // %4
-      "r"(&kShuf38_2),        // %5
-      "r"(&kMult38_Div9)      // %6
-    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q13", "q14", "q15", "memory", "cc"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
   );
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "vld1.u16     {q13}, [%4]                  \n"
-    "vld1.u8      {q14}, [%5]                  \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
+    MEMACCESS(4)
+    "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q14}, [%5]                    \n"
+    "add        %3, %0                         \n"
+  "1:                                          \n"
 
     // d0 = 00 40 01 41 02 42 03 43
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"
 
     // Shuffle the input data around to get align the data
     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -457,78 +515,507 @@ void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    "vst1.u8      {d3}, [%1]!                  \n"
-    "vst1.u32     {d4[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
-    : "+r"(src_ptr),       // %0
-      "+r"(dst_ptr),       // %1
-      "+r"(dst_width),     // %2
-      "+r"(src_stride)     // %3
-    : "r"(&kMult38_Div6),  // %4
-      "r"(&kShuf38_2)      // %5
-    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp;
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       r12, %5                         \n"
+    "veor      q2, q2, q2                      \n"
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"
+    "vaddw.u8   q3, q3, d1                     \n"
+    "vaddw.u8   q2, q2, d0                     \n"
+    "subs       r12, r12, #1                   \n"
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "=&r"(src_tmp),    // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_ptr),     // %2
+    "+r"(src_stride),  // %3
+    "+r"(src_width),   // %4
+    "+r"(src_height)   // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
   );
 }
 
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                     \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+
+// The NEON version mimics this formula:
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  asm volatile (
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"
+    "vmovl.u8   q8, d6                         \n"
+    "vmovl.u8   q9, d7                         \n"
+    "vsubl.s16  q11, d18, d16                  \n"
+    "vsubl.s16  q12, d19, d17                  \n"
+    "vmovl.u16  q13, d20                       \n"
+    "vmovl.u16  q10, d21                       \n"
+    "vmul.s32   q11, q11, q13                  \n"
+    "vmul.s32   q12, q12, q10                  \n"
+    "vrshrn.s32  d18, q11, #16                 \n"
+    "vrshrn.s32  d19, q12, #16                 \n"
+    "vadd.s16   q8, q8, q9                     \n"
+    "vmovn.s16  d6, q8                         \n"
+
+    MEMACCESS(0)
+    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
+    "vadd.s32   q1, q1, q0                     \n"
+    "vadd.s32   q2, q2, q0                     \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                           const uint8* src_ptr, ptrdiff_t src_stride,
                           int dst_width, int source_y_fraction) {
   asm volatile (
     "cmp          %4, #0                       \n"
-    "beq          2f                           \n"
+    "beq          100f                         \n"
     "add          %2, %1                       \n"
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
     "cmp          %4, #128                     \n"
-    "beq          3f                           \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
 
     "vdup.8       d5, %4                       \n"
     "rsb          %4, #256                     \n"
     "vdup.8       d4, %4                       \n"
-    "1:                                        \n"
-    "vld1.u8      {q0}, [%1]!                  \n"
-    "vld1.u8      {q1}, [%2]!                  \n"
-    "subs         %3, #16                      \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
     "vmull.u8     q13, d0, d4                  \n"
     "vmull.u8     q14, d1, d4                  \n"
     "vmlal.u8     q13, d2, d5                  \n"
     "vmlal.u8     q14, d3, d5                  \n"
     "vrshrn.u16   d0, q13, #8                  \n"
     "vrshrn.u16   d1, q14, #8                  \n"
-    "vst1.u8      {q0}, [%0]!                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
     "bgt          1b                           \n"
-    "b            4f                           \n"
-
-    "2:                                        \n"
-    "vld1.u8      {q0}, [%1]!                  \n"
-    "subs         %3, #16                      \n"
-    "vst1.u8      {q0}, [%0]!                  \n"
-    "bgt          2b                           \n"
-    "b            4f                           \n"
-
-    "3:                                        \n"
-    "vld1.u8      {q0}, [%1]!                  \n"
-    "vld1.u8      {q1}, [%2]!                  \n"
-    "subs         %3, #16                      \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
-    "vst1.u8      {q0}, [%0]!                  \n"
-    "bgt          3b                           \n"
-    "4:                                        \n"
-    "vst1.u8      {d1[7]}, [%0]                \n"
-    : "+r"(dst_ptr),          // %0
-      "+r"(src_ptr),          // %1
-      "+r"(src_stride),       // %2
-      "+r"(dst_width),        // %3
-      "+r"(source_y_fraction) // %4
-    :
-    : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "vst1.8       {d1[7]}, [%0]                \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
   );
 }
 
-#endif  // __ARM_NEON__
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
+    "vld2.32    {q2, q3}, [%0]!                \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    "vrshrn.u16 d2, q2, #1                     \n"
+    "vrshrn.u16 d3, q3, #1                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
+    "bgt       1b                              \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  int tmp;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(d0, 0)
+    LOAD1_DATA32_LANE(d0, 1)
+    LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1)
+    LOAD1_DATA32_LANE(d2, 0)
+    LOAD1_DATA32_LANE(d2, 1)
+    LOAD1_DATA32_LANE(d3, 0)
+    LOAD1_DATA32_LANE(d3, 1)
+
+    MEMACCESS(0)
+    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),   // %0
+    "+r"(src_argb),   // %1
+    "+r"(dst_width),  // %2
+    "+r"(x),          // %3
+    "+r"(dx),         // %4
+    "=&r"(tmp),       // %5
+    "+r"(src_tmp)     // %6
+  :
+  : "memory", "cc", "q0", "q1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
+    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q8, q1, q0                     \n"
+  "1:                                          \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(d0, d2, 0)
+    LOAD2_DATA32_LANE(d0, d2, 1)
+    LOAD2_DATA32_LANE(d1, d3, 0)
+    LOAD2_DATA32_LANE(d1, d3, 1)
+    "vshrn.i32   d22, q8, #9                   \n"
+    "vand.16     d22, d22, d30                 \n"
+    "vdup.8      d24, d22[0]                   \n"
+    "vdup.8      d25, d22[2]                   \n"
+    "vdup.8      d26, d22[4]                   \n"
+    "vdup.8      d27, d22[6]                   \n"
+    "vext.8      d4, d24, d25, #4              \n"
+    "vext.8      d5, d26, d27, #4              \n"  // f
+    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
+    "vmull.u8    q11, d0, d20                  \n"
+    "vmull.u8    q12, d1, d21                  \n"
+    "vmull.u8    q13, d2, d4                   \n"
+    "vmull.u8    q14, d3, d5                   \n"
+    "vadd.i16    q11, q11, q13                 \n"
+    "vadd.i16    q12, q12, q14                 \n"
+    "vshrn.i16   d0, q11, #7                   \n"
+    "vshrn.i16   d1, q12, #7                   \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
+    "vadd.s32    q8, q8, q9                    \n"
+    "subs        %2, %2, #4                    \n"  // 4 processed per loop
+    "bgt         1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
new file mode 100644
index 00000000..ff277f26
--- /dev/null
+++ b/files/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0, odd into v1
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1"              // Clobber List
+  );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #1              \n"
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1"     // Clobber List
+  );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
+    "uadalp     v1.8h, v3.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #2              \n"
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "v0", "v1", "v2", "v3"     // Clobber List
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "st1     {v2.8b}, [%1], #8                 \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
+    MEMACCESS(3)
+    "ld1     {v1.16b}, [%2], #16               \n"
+    MEMACCESS(4)
+    "ld1     {v2.16b}, [%3], #16               \n"
+    MEMACCESS(5)
+    "ld1     {v3.16b}, [%4], #16               \n"
+    "subs    %w5, %w5, #4                      \n"
+    "uaddlp  v0.8h, v0.16b                     \n"
+    "uadalp  v0.8h, v1.16b                     \n"
+    "uadalp  v0.8h, v2.16b                     \n"
+    "uadalp  v0.8h, v3.16b                     \n"
+    "addp    v0.8h, v0.8h, v0.8h               \n"
+    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
+    MEMACCESS(1)
+    "st1    {v0.s}[0], [%1], #4                \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(src_ptr1),  // %2
+    "+r"(src_ptr2),  // %3
+    "+r"(src_ptr3),  // %4
+    "+r"(dst_width)  // %5
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "subs      %w2, %w2, #24                           \n"
+    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"
+    "add       %3, %3, %0                              \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "ushll     v16.8h, v4.8b, #0                       \n"
+    "ushll     v17.8h, v5.8b, #0                       \n"
+    "ushll     v18.8h, v6.8b, #0                       \n"
+    "ushll     v19.8h, v7.8b, #0                       \n"
+
+    // 3 * line_0 + line_1
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "umlal     v17.8h, v1.8b, v20.8b                   \n"
+    "umlal     v18.8h, v2.8b, v20.8b                   \n"
+    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+    "uqrshrn   v1.8b, v17.8h, #2                       \n"
+    "uqrshrn   v2.8b, v18.8h, #2                       \n"
+    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v16.8h, v1.8b, #0                       \n"
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v16.8h, v2.8b, #0                       \n"
+    "umlal     v16.8h, v3.8b, v20.8b                   \n"
+    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+    "v20", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"
+    "add       %3, %3, %0                              \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"
+    // average src line 0 with src line 1
+    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v4.8h, v1.8b, #0                        \n"
+    "umlal     v4.8h, v0.8b, v20.8b                    \n"
+    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v4.8h, v2.8b, #0                        \n"
+    "umlal     v4.8h, v3.8b, v20.8b                    \n"
+    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+  );
+}
+
+static uvec8 kShuf38 =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1       {v3.16b}, [%3]                          \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
+    "subs      %w2, %w2, #12                           \n"
+    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
+    MEMACCESS(1)
+    "st1       {v2.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v2.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+  ptrdiff_t tmp_src_stride = src_stride;
+
+  asm volatile (
+    MEMACCESS(5)
+    "ld1       {v29.8h}, [%5]                          \n"
+    MEMACCESS(6)
+    "ld1       {v30.16b}, [%6]                         \n"
+    MEMACCESS(7)
+    "ld1       {v31.8h}, [%7]                          \n"
+    "add       %2, %2, %0                              \n"
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    MEMACCESS(4)
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
+    "subs      %w4, %w4, #12                           \n"
+
+    // Shuffle the input data around to get align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v20.8b, v0.8b, v1.8b                    \n"
+    "trn2      v21.8b, v0.8b, v1.8b                    \n"
+    "trn1      v22.8b, v4.8b, v5.8b                    \n"
+    "trn2      v23.8b, v4.8b, v5.8b                    \n"
+    "trn1      v24.8b, v16.8b, v17.8b                  \n"
+    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+    "trn1      v16.8b, v18.8b, v19.8b                  \n"
+    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v20.4h, v20.8b                          \n"
+    "uaddlp    v21.4h, v21.8b                          \n"
+    "uaddlp    v22.4h, v22.8b                          \n"
+    "uaddlp    v23.4h, v23.8b                          \n"
+    "uaddlp    v24.4h, v24.8b                          \n"
+    "uaddlp    v25.4h, v25.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+
+    // combine source lines
+    "add       v20.4h, v20.4h, v22.4h                  \n"
+    "add       v21.4h, v21.4h, v23.4h                  \n"
+    "add       v20.4h, v20.4h, v24.4h                  \n"
+    "add       v21.4h, v21.4h, v25.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+    "add       v2.4h, v2.4h, v17.4h                    \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+    "xtn       v2.8b,  v2.8h                           \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "ushll     v16.8h, v16.8b, #0                      \n"
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // combine source lines
+    "add       v0.8h, v0.8h, v16.8h                    \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v20.8h, v20.8h, v0.8h                   \n"
+    "add       v21.8h, v21.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(tmp_src_stride),   // %2
+    "+r"(src_ptr1),         // %3
+    "+r"(dst_width)         // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+    "v30", "v31", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;
+  asm volatile (
+    MEMACCESS(4)
+    "ld1       {v30.8h}, [%4]                          \n"
+    MEMACCESS(5)
+    "ld1       {v31.16b}, [%5]                         \n"
+    "add       %2, %2, %0                              \n"
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    "subs      %w3, %w3, #12                           \n"
+
+    // Shuffle the input data around to get align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v16.8b, v0.8b, v1.8b                    \n"
+    "trn2      v17.8b, v0.8b, v1.8b                    \n"
+    "trn1      v18.8b, v4.8b, v5.8b                    \n"
+    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v16.4h, v16.8b                          \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+    "uaddlp    v18.4h, v18.8b                          \n"
+    "uaddlp    v19.4h, v19.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+
+    // combine source lines
+    "add       v16.4h, v16.4h, v18.4h                  \n"
+    "add       v17.4h, v17.4h, v19.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+    // combine source lines
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v16.8h, v16.8h, v0.8h                   \n"
+    "add       v17.8h, v17.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),         // %0
+    "+r"(dst_ptr),         // %1
+    "+r"(tmp_src_stride),  // %2
+    "+r"(dst_width)        // %3
+  : "r"(&kMult38_Div6),    // %4
+    "r"(&kShuf38_2)        // %5
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v30", "v31", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp;
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       w12, %w5                        \n"
+    "eor       v2.16b, v2.16b, v2.16b          \n"
+    "eor       v3.16b, v3.16b, v3.16b          \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %3              \n"
+    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
+    "uaddw     v2.8h, v2.8h, v0.8b             \n"
+    "subs      w12, w12, #1                    \n"
+    "b.gt      2b                              \n"
+    MEMACCESS(2)
+    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+    "add      %1, %1, #16                      \n"
+    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+    "b.gt     1b                               \n"
+  : "=&r"(src_tmp),    // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_ptr),     // %2
+    "+r"(src_stride),  // %3
+    "+r"(src_width),   // %4
+    "+r"(src_height)   // %5
+  :
+  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                    \n"              \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v1.4s, v1.4s, v0.4s            \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "add        v2.4s, v1.4s, v3.4s            \n"
+    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "mov       v6.16b, v1.16b                  \n"
+    "mov       v7.16b, v2.16b                  \n"
+    "uzp1      v6.8h, v6.8h, v7.8h             \n"
+    "ushll     v4.8h, v4.8b, #0                \n"
+    "ushll     v5.8h, v5.8b, #0                \n"
+    "ssubl     v16.4s, v5.4h, v4.4h            \n"
+    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
+    "ushll     v7.4s, v6.4h, #0                \n"
+    "ushll2    v6.4s, v6.8h, #0                \n"
+    "mul       v16.4s, v16.4s, v7.4s           \n"
+    "mul       v17.4s, v17.4s, v6.4s           \n"
+    "rshrn      v6.4h, v16.4s, #16             \n"
+    "rshrn2     v6.8h, v17.4s, #16             \n"
+    "add       v4.8h, v4.8h, v6.8h             \n"
+    "xtn       v4.8b, v4.8h                    \n"
+
+    MEMACCESS(0)
+    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
+    "add       v1.4s, v1.4s, v0.4s             \n"
+    "add       v2.4s, v2.4s, v0.4s             \n"
+    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
+    "b.gt      1b                              \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
+    "v4", "v5", "v6", "v7", "v16", "v17"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+    int y_fraction = 256 - source_y_fraction;
+  asm volatile (
+    "cmp          %w4, #0                      \n"
+    "b.eq         100f                         \n"
+    "add          %2, %2, %1                   \n"
+    "cmp          %w4, #64                     \n"
+    "b.eq         75f                          \n"
+    "cmp          %w4, #128                    \n"
+    "b.eq         50f                          \n"
+    "cmp          %w4, #192                    \n"
+    "b.eq         25f                          \n"
+
+    "dup          v5.8b, %w4                   \n"
+    "dup          v4.8b, %w5                   \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "umull        v6.8h, v0.8b, v4.8b          \n"
+    "umull2       v7.8h, v0.16b, v4.16b        \n"
+    "umlal        v6.8h, v1.8b, v5.8b          \n"
+    "umlal2       v7.8h, v1.16b, v5.16b        \n"
+    "rshrn        v0.8b, v6.8h, #8             \n"
+    "rshrn2       v0.16b, v7.8h, #8            \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v1.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v0.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "st1          {v0.b}[15], [%0]             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction),// %4
+    "+r"(y_fraction)        // %5
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS (0)
+    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
+    MEMACCESS (0)
+    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS (1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    MEMACCESS (1)
+    "st1        {v3.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (dst),              // %1
+    "+r" (dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS (0)
+    // load 8 ARGB pixels.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #1               \n"
+    "rshrn      v2.8b, v2.8h, #1               \n"
+    "rshrn      v3.8b, v3.8h, #1               \n"
+    MEMACCESS (1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS (0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS (1)
+    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
+    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
+    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #2               \n"
+    "rshrn      v2.8b, v2.8h, #2               \n"
+    "rshrn      v3.8b, v3.8h, #2               \n"
+    MEMACCESS (2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (src_stride),       // %1
+    "+r" (dst),              // %2
+    "+r" (dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[0], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[1], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[2], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[3], [%0], %3            \n"
+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"((int64)(src_stepx * 4)) // %3
+  : "memory", "cc", "v0"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v4.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v5.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v6.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v7.8b}, [%1], %4              \n"
+    "uaddl      v0.8h, v0.8b, v1.8b            \n"
+    "uaddl      v2.8h, v2.8b, v3.8b            \n"
+    "uaddl      v4.8h, v4.8b, v5.8b            \n"
+    "uaddl      v6.8h, v6.8b, v7.8b            \n"
+    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+    "mov        v0.d[1], v2.d[0]               \n"
+    "mov        v2.d[0], v16.d[1]              \n"
+    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+    "mov        v4.d[1], v6.d[0]               \n"
+    "mov        v6.d[0], v16.d[1]              \n"
+    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "st1     {v0.16b}, [%2], #16               \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"((int64)(src_stepx * 4)) // %4
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld1        {"#vn".s}["#n"], [%6]          \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  int64 tmp64;
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(v0, 0)
+    LOAD1_DATA32_LANE(v0, 1)
+    LOAD1_DATA32_LANE(v0, 2)
+    LOAD1_DATA32_LANE(v0, 3)
+    LOAD1_DATA32_LANE(v1, 0)
+    LOAD1_DATA32_LANE(v1, 1)
+    LOAD1_DATA32_LANE(v1, 2)
+    LOAD1_DATA32_LANE(v1, 3)
+
+    MEMACCESS(0)
+    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "b.gt        1b                            \n"
+  : "+r"(dst_argb),     // %0
+    "+r"(src_argb),     // %1
+    "+r"(dst_width64),  // %2
+    "+r"(x64),          // %3
+    "+r"(dx64),         // %4
+    "=&r"(tmp64),       // %5
+    "+r"(src_tmp)       // %6
+  :
+  : "memory", "cc", "v0", "v1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"
+    "movi       v3.16b, #0x7f                  \n"  // 0x7F
+    "movi       v4.8h, #0x7f                   \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v5.4s, v1.4s, v0.4s            \n"
+  "1:                                          \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(v0, v1, 0)
+    LOAD2_DATA32_LANE(v0, v1, 1)
+    LOAD2_DATA32_LANE(v0, v1, 2)
+    LOAD2_DATA32_LANE(v0, v1, 3)
+    "shrn       v2.4h, v5.4s, #9               \n"
+    "and        v2.8b, v2.8b, v4.8b            \n"
+    "dup        v16.8b, v2.b[0]                \n"
+    "dup        v17.8b, v2.b[2]                \n"
+    "dup        v18.8b, v2.b[4]                \n"
+    "dup        v19.8b, v2.b[6]                \n"
+    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
+    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
+    "ins        v2.d[1], v17.d[0]              \n"  // f
+    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
+    "umull      v16.8h, v0.8b, v7.8b           \n"
+    "umull2     v17.8h, v0.16b, v7.16b         \n"
+    "umull      v18.8h, v1.8b, v2.8b           \n"
+    "umull2     v19.8h, v1.16b, v2.16b         \n"
+    "add        v16.8h, v16.8h, v18.8h         \n"
+    "add        v17.8h, v17.8h, v19.8h         \n"
+    "shrn       v0.8b, v16.8h, #7              \n"
+    "shrn2      v0.16b, v17.8h, #7             \n"
+
+    MEMACCESS(0)
+    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
+    "add     v5.4s, v5.4s, v6.4s               \n"
+    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
+    "b.gt    1b                                \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+    "v6", "v7", "v16", "v17", "v18", "v19"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
new file mode 100644
index 00000000..f1709736
--- /dev/null
+++ b/files/source/scale_win.cc
@@ -0,0 +1,1374 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+__declspec(naked)
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm1, ymm1, 8
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15
+    vpackuswb   ymm4, ymm4, ymm4
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi
+    mov         eax, [esp + 4 + 4]    // src_ptr
+    mov         esi, [esp + 4 + 8]    // src_stride
+    mov         edx, [esp + 4 + 12]   // dst_ptr
+    mov         ecx, [esp + 4 + 16]   // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15
+    vpackuswb   ymm4, ymm4, ymm4
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    vmovdqu     ymm2, [eax + esi]
+    vmovdqu     ymm3, [eax + esi + 32]
+    lea         eax,  [eax + 64]
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // vertical add
+    vpaddw      ymm1, ymm1, ymm3
+    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm1, ymm1, 1
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels.
+__declspec(naked)
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    psrld      xmm5, 24
+    pslld      xmm5, 16
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1.
+__declspec(naked)
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    movdqa     xmm5, xmm4
+    packuswb   xmm4, xmm4
+    psllw      xmm5, 3               // constant 0x0008
+
+  wloop:
+    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // vertical add rows 0, 1
+    paddw      xmm1, xmm3
+    movdqu     xmm2, [eax + esi * 2]
+    movdqu     xmm3, [eax + esi * 2 + 16]
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // add row 2
+    paddw      xmm1, xmm3
+    movdqu     xmm2, [eax + edi]
+    movdqu     xmm3, [eax + edi + 16]
+    lea        eax, [eax + 32]
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // add row 3
+    paddw      xmm1, xmm3
+    phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5      // + 8 for round
+    psrlw      xmm0, 4         // /16 for average of 4 * 4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    vpsrld      ymm5, ymm5, 24
+    vpslld      ymm5, ymm5, 16
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpand       ymm0, ymm0, ymm5
+    vpand       ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpsrlw      ymm0, ymm0, 8
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0
+    lea         edx, [edx + 16]
+    sub         ecx, 16
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi
+    push        edi
+    mov         eax, [esp + 8 + 4]    // src_ptr
+    mov         esi, [esp + 8 + 8]    // src_stride
+    mov         edx, [esp + 8 + 12]   // dst_ptr
+    mov         ecx, [esp + 8 + 16]   // dst_width
+    lea         edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
+    vpsrlw      ymm4, ymm4, 15
+    vpsllw      ymm5, ymm4, 3               // constant 0x0008
+    vpackuswb   ymm4, ymm4, ymm4
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vmovdqu     ymm2, [eax + esi]
+    vmovdqu     ymm3, [eax + esi + 32]
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
+    vpaddw      ymm1, ymm1, ymm3
+    vmovdqu     ymm2, [eax + esi * 2]
+    vmovdqu     ymm3, [eax + esi * 2 + 32]
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // add row 2
+    vpaddw      ymm1, ymm1, ymm3
+    vmovdqu     ymm2, [eax + edi]
+    vmovdqu     ymm3, [eax + edi + 32]
+    lea         eax,  [eax + 64]
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // add row 3
+    vpaddw      ymm1, ymm1, ymm3
+    vphaddw     ymm0, ymm0, ymm1      // mutates
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
+    vpsrlw      ymm0, ymm0, 4         // /32 for average of 4 * 4
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0
+    lea         edx, [edx + 16]
+    sub         ecx, 16
+    jg          wloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked)
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm3, xmmword ptr kShuf0
+    movdqa     xmm4, xmmword ptr kShuf1
+    movdqa     xmm5, xmmword ptr kShuf2
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm1
+    palignr    xmm1, xmm0, 8
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + 8], xmm1
+    movq       qword ptr [edx + 16], xmm2
+    lea        edx, [edx + 24]
+    sub        ecx, 24
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, xmmword ptr kMadd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx + 24]
+    sub        ecx, 24
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, xmmword ptr kMadd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx+24]
+    sub        ecx, 24
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked)
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm4, xmmword ptr kShuf38a
+    movdqa     xmm5, xmmword ptr kShuf38b
+
+  xloop:
+    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm4
+    pshufb     xmm1, xmm5
+    paddusb    xmm0, xmm1
+
+    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movhlps    xmm1, xmm0
+    movd       [edx + 8], xmm1
+    lea        edx, [edx + 12]
+    sub        ecx, 12
+    jg         xloop
+
+    ret
+  }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShufAc
+    movdqa     xmm3, xmmword ptr kShufAc3
+    movdqa     xmm4, xmmword ptr kScaleAc33
+    pxor       xmm5, xmm5
+
+  xloop:
+    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm6, [eax + esi]
+    movhlps    xmm1, xmm0
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+    movdqu     xmm6, [eax + esi * 2]
+    lea        eax, [eax + 16]
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+
+    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0
+    pshufb     xmm6, xmm2
+
+    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    pshufb     xmm7, xmm3
+    paddusw    xmm6, xmm7
+
+    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    packuswb   xmm6, xmm6
+
+    movd       [edx], xmm6           // write 6 pixels
+    psrlq      xmm6, 16
+    movd       [edx + 2], xmm6
+    lea        edx, [edx + 6]
+    sub        ecx, 6
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShufAb0
+    movdqa     xmm3, xmmword ptr kShufAb1
+    movdqa     xmm4, xmmword ptr kShufAb2
+    movdqa     xmm5, xmmword ptr kScaleAb2
+
+  xloop:
+    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm1, [eax + esi]
+    lea        eax, [eax + 16]
+    pavgb      xmm0, xmm1
+
+    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    pshufb     xmm1, xmm2
+    movdqa     xmm6, xmm0
+    pshufb     xmm6, xmm3
+    paddusw    xmm1, xmm6
+    pshufb     xmm0, xmm4
+    paddusw    xmm1, xmm0
+
+    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    packuswb   xmm1, xmm1
+
+    movd       [edx], xmm1           // write 6 pixels
+    psrlq      xmm1, 16
+    movd       [edx + 2], xmm1
+    lea        edx, [edx + 6]
+    sub        ecx, 6
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5
+
+  // sum rows
+  xloop:
+    movdqu     xmm3, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5
+    punpckhbw  xmm3, xmm5
+    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm1, xmm3
+    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 16
+    jg         xloop
+    ret
+  }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov         eax, [esp + 4]   // src_ptr
+    mov         edx, [esp + 8]   // dst_ptr
+    mov         ecx, [esp + 12]  // src_width
+    vpxor       ymm5, ymm5, ymm5
+
+  // sum rows
+  xloop:
+    vmovdqu     ymm3, [eax]       // read 32 bytes
+    lea         eax, [eax + 32]
+    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    vpunpcklbw  ymm2, ymm3, ymm5
+    vpunpckhbw  ymm3, ymm3, ymm5
+    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm1, ymm3, [edx + 32]
+    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]
+    sub         ecx, 32
+    jg          xloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 =
+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 =
+  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked)
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        edi, [esp + 12 + 4]    // dst_ptr
+    mov        esi, [esp + 12 + 8]    // src_ptr
+    mov        ecx, [esp + 12 + 12]   // dst_width
+    movd       xmm2, [esp + 12 + 16]  // x
+    movd       xmm3, [esp + 12 + 20]  // dx
+    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    movd       xmm5, eax
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pcmpeqb    xmm7, xmm7           // generate 0x0001
+    psrlw      xmm7, 15
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2
+    jl         xloop29
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
+    movd       xmm4, ebx
+    pshufb     xmm1, xmm5           // 0011
+    punpcklwd  xmm0, xmm4
+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
+    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
+    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
+    movd       ebx, xmm1
+    mov        [edi], bx
+    lea        edi, [edi + 2]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+    add        ecx, 2 - 1
+    jl         xloop99
+
+    // 1 pixel remainder
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm2, 9              // 7 bit fractions.
+    pshufb     xmm2, xmm5           // 0011
+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
+    pmaddubsw  xmm2, xmm0           // 16 bit
+    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
+    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm2, xmm2           // 8 bits
+    movd       ebx, xmm2
+    mov        [edi], bl
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+__declspec(naked)
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_ptr
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0
+    punpckhbw  xmm1, xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    ret
+  }
+}
+
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+__declspec(naked)
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    shufps     xmm0, xmm1, 0xdd
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x1 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x2 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_argb
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_argb
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels at a time.
+__declspec(naked)
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_argb
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_argb
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+  wloop:
+    movd       xmm0, [eax]
+    movd       xmm1, [eax + ebx]
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2]
+    movd       xmm3, [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         wloop
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 to 4x1.
+__declspec(naked)
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_argb
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_argb
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked)
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  __asm {
+    push       edi
+    push       esi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+
+    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    paddd      xmm2, xmm0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0            // x3 x2 x1 x0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+
+    pextrw     eax, xmm2, 1          // get x0 integer.
+    pextrw     edx, xmm2, 3          // get x1 integer.
+
+    cmp        ecx, 0
+    jle        xloop99
+    sub        ecx, 4
+    jl         xloop49
+
+    // 4 Pixel loop.
+ xloop4:
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    pextrw     edx, xmm2, 7           // get x3 integer.
+    paddd      xmm2, xmm3             // x += dx
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
+    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
+    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4             // x2 x3
+    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4                 // 4 pixels
+    jge        xloop4
+
+ xloop49:
+    test       ecx, 2
+    je         xloop29
+
+    // 2 Pixels.
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+
+ xloop29:
+    test       ecx, 1
+    je         xloop99
+
+    // 1 Pixels.
+    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
+    movd       dword ptr [edi], xmm0
+ xloop99:
+
+    pop        esi
+    pop        edi
+    ret
+  }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked)
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+    movdqa     xmm4, xmmword ptr kShuffleColARGB
+    movdqa     xmm5, xmmword ptr kShuffleFractions
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2
+    jl         xloop29
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
+    pshufb     xmm1, xmm5           // 0000000011111111
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+
+    add        ecx, 2 - 1
+    jl         xloop99
+
+    // 1 pixel remainder
+    psrlw      xmm2, 9              // 7 bit fractions.
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    pshufb     xmm2, xmm5           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    movd       [edi], xmm0
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+__declspec(naked)
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_argb
+    mov        eax, [esp + 8]    // src_argb
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpckldq  xmm0, xmm0
+    punpckhdq  xmm1, xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         wloop
+
+    ret
+  }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    idiv       dword ptr [esp + 8]
+    ret
+  }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv1_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    mov        ecx, [esp + 8]    // denom
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    sub        eax, 0x00010001
+    sbb        edx, 0
+    sub        ecx, 1
+    idiv       ecx
+    ret
+  }
+}
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 616affd1..00fb71e1 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -16,30 +16,40 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
 
 struct FourCCAliasEntry {
   uint32 alias;
   uint32 canonical;
 };
 
-static const FourCCAliasEntry kFourCCAliases[] = {
+static const struct FourCCAliasEntry kFourCCAliases[] = {
   {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU12, FOURCC_I420},
   {FOURCC_YU16, FOURCC_I422},
   {FOURCC_YU24, FOURCC_I444},
   {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},
+  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
   {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},
-  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
   {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_RGB3, FOURCC_RAW},
+  {FOURCC_DMB1, FOURCC_MJPG},
+  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+  {FOURCC_RGB3, FOURCC_RAW },
   {FOURCC_BGR3, FOURCC_24BG},
+  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
+  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
 };
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
 
 LIBYUV_API
 uint32 CanonicalFourCC(uint32 fourcc) {
-  for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+  int i;
+  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
     if (kFourCCAliases[i].alias == fourcc) {
       return kFourCCAliases[i].canonical;
     }
diff --git a/files/sync_chromium.py b/files/sync_chromium.py
new file mode 100755
index 00000000..4e51b6bd
--- /dev/null
+++ b/files/sync_chromium.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Script to download a Chromium checkout into the workspace.
+
+The script downloads a full Chromium Git clone and its DEPS.
+
+The following environment variable can be used to alter the behavior:
+* CHROMIUM_NO_HISTORY - If set to 1, a Git checkout with no history will be
+  downloaded. This is consumes less bandwidth and disk space but is known to be
+  slower in general if you have a high-speed connection.
+
+After a successful sync has completed, a .last_sync_chromium file is written to
+the chromium directory. While it exists, no more gclient sync operations will be
+performed until the --target-revision changes or the SCRIPT_VERSION constant is
+incremented. The file can be removed manually to force a new sync.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+# Bump this whenever the algorithm changes and you need bots/devs to re-sync,
+# ignoring the .last_sync_chromium file
+SCRIPT_VERSION = 4
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_NO_HISTORY = 'CHROMIUM_NO_HISTORY'
+
+def _parse_gclient_dict():
+  gclient_dict = {}
+  try:
+    main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient')
+    with open(main_gclient, 'rb') as deps_content:
+      exec(deps_content, gclient_dict)
+  except Exception as e:
+    print >> sys.stderr, 'error while parsing .gclient:', e
+  return gclient_dict
+
+
+def get_cache_dir():
+  return _parse_gclient_dict().get('cache_dir')
+
+
+def get_target_os_list():
+  return ','.join(_parse_gclient_dict().get('target_os', []))
+
+
+def main():
+  CR_DIR = os.path.join(ROOT_DIR, 'chromium')
+
+  p = argparse.ArgumentParser()
+  p.add_argument('--target-revision', required=True,
+                 help='The target chromium git revision [REQUIRED]')
+  p.add_argument('--chromium-dir', default=CR_DIR,
+                 help=('The path to the chromium directory to sync '
+                       '(default: %(default)r)'))
+  opts = p.parse_args()
+  opts.chromium_dir = os.path.abspath(opts.chromium_dir)
+
+  target_os_list = get_target_os_list()
+
+  # Do a quick check to see if we were successful last time to make runhooks
+  # sooper fast.
+  flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium')
+  flag_file_content = '\n'.join([
+    str(SCRIPT_VERSION),
+    opts.target_revision,
+    repr(target_os_list),
+  ])
+  if (os.path.exists(os.path.join(opts.chromium_dir, 'src')) and
+      os.path.exists(flag_file)):
+    with open(flag_file, 'r') as f:
+      if f.read() == flag_file_content:
+        print 'Chromium already up to date: ', opts.target_revision
+        return 0
+    os.unlink(flag_file)
+
+  env = os.environ.copy()
+
+  # Avoid downloading NaCl toolchain as part of the Chromium hooks.
+  env['GYP_CHROMIUM_NO_ACTION'] = '1'
+  gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient'
+  args = [
+      gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision
+  ]
+
+  if os.environ.get('CHROME_HEADLESS') == '1':
+    # Running on a buildbot.
+    args.append('-vvv')
+
+    if sys.platform.startswith('win'):
+      cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep,
+                                'b', 'git-cache')
+    else:
+      cache_path = '/b/git-cache'
+  else:
+    # Support developers setting the cache_dir in .gclient.
+    cache_path = get_cache_dir()
+
+  # Allow for users with poor internet connections to download a Git clone
+  # without history (saves several gigs but is generally slower and doesn't work
+  # with the Git cache).
+  if os.environ.get(CHROMIUM_NO_HISTORY) == '1':
+    if cache_path:
+      print >> sys.stderr, (
+          'You cannot use "no-history" mode for syncing Chrome (i.e. set the '
+          '%s environment variable to 1) when you have cache_dir configured in '
+          'your .gclient.' % CHROMIUM_NO_HISTORY)
+      return 1
+    args.append('--no-history')
+    gclient_entries_file = os.path.join(opts.chromium_dir, '.gclient_entries')
+  else:
+    # Write a temporary .gclient file that has the cache_dir variable added.
+    gclientfile = os.path.join(opts.chromium_dir, '.gclient')
+    with open(gclientfile, 'rb') as spec:
+      spec = spec.read().splitlines()
+      spec[-1] = 'cache_dir = %r' % (cache_path,)
+    with open(gclientfile + '.tmp', 'wb') as f:
+      f.write('\n'.join(spec))
+
+    args += [
+      '--gclientfile', '.gclient.tmp',
+      '--delete_unversioned_trees', '--reset', '--upstream'
+    ]
+    gclient_entries_file = os.path.join(opts.chromium_dir,
+                                        '.gclient.tmp_entries')
+
+  # To avoid gclient sync problems when DEPS entries have been removed we must
+  # wipe the gclient's entries file that contains cached URLs for all DEPS.
+  if os.path.exists(gclient_entries_file):
+    os.unlink(gclient_entries_file)
+
+  if target_os_list:
+    args += ['--deps=' + target_os_list]
+
+  print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir)
+  ret = subprocess.call(args, cwd=opts.chromium_dir, env=env)
+  if ret == 0:
+    with open(flag_file, 'wb') as f:
+      f.write(flag_file_content)
+
+  return ret
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/files/third_party/gflags/BUILD.gn b/files/third_party/gflags/BUILD.gn
new file mode 100644
index 00000000..69a07232
--- /dev/null
+++ b/files/third_party/gflags/BUILD.gn
@@ -0,0 +1,76 @@
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of WebRTC's BUILD.gn.
+
+if (is_win) {
+  gflags_gen_arch_root = "gen/win"
+} else {
+  gflags_gen_arch_root = "gen/posix"
+}
+
+config("gflags_config") {
+  include_dirs = [
+    "$gflags_gen_arch_root/include",  # For configured files.
+    "src/src",  # For everything else.
+  ]
+
+  defines = [
+    # These macros exist so flags and symbols are properly exported when
+    # building DLLs. Since we don't build DLLs, we need to disable them.
+    "GFLAGS_DLL_DECL=",
+    "GFLAGS_DLL_DECLARE_FLAG=",
+    "GFLAGS_DLL_DEFINE_FLAG=",
+  ]
+
+  # GN orders flags on a target before flags from configs. The default config
+  # adds -Wall, and this flag have to be after -Wall -- so they need to
+  # come from a config and can't be on the target directly.
+  if (is_clang) {
+    cflags = [ "-Wno-unused-local-typedef" ]
+  }
+}
+
+source_set("gflags") {
+  cflags = []
+  sources = [
+    "src/src/gflags.cc",
+    "src/src/gflags_completions.cc",
+    "src/src/gflags_reporting.cc",
+  ]
+  if (is_win) {
+    sources += [ "src/src/windows_port.cc" ]
+
+    cflags += [
+      "/wd4005",  # WIN32_LEAN_AND_MEAN.
+      "/wd4267",  # Conversion from size_t to "type".
+    ]
+  }
+
+  include_dirs = [
+    "$gflags_gen_arch_root/include/gflags",  # For configured files.
+    "$gflags_gen_arch_root/include/private",  # For config.h
+  ]
+
+  public_configs = [ ":gflags_config" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  if (is_win) {
+    configs -= [ "//build/config/win:unicode" ]
+  }
+
+  if (is_clang) {
+    # TODO(andrew): Look into fixing this warning upstream:
+    # http://code.google.com/p/webrtc/issues/detail?id=760
+    configs -= [ "//build/config/clang:extra_warnings" ]
+    cflags += [ "-Wno-microsoft-include" ]
+  }
+}
diff --git a/files/third_party/gflags/LICENSE b/files/third_party/gflags/LICENSE
new file mode 100644
index 00000000..d15b0c24
--- /dev/null
+++ b/files/third_party/gflags/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/files/third_party/gflags/README.libyuv b/files/third_party/gflags/README.libyuv
new file mode 100644
index 00000000..5b3bc2db
--- /dev/null
+++ b/files/third_party/gflags/README.libyuv
@@ -0,0 +1,28 @@
+URL: https://github.com/gflags/gflags
+Version: 2.1.2
+License: New BSD
+License File: LICENSE
+
+Description:
+The gflags package contains a library that implements commandline
+flags processing. As such it's a replacement for getopt(). It has
+increased flexibility, including built-in support for C++ types like
+string, and the ability to define flags in the source file in which
+they're used.
+
+Local Modifications: None
+
+
+How to update platform configuration files:
+The gen/ directory contains pre-generated configuration header files.
+Historically, all operating systems and architectures have generated
+similar configurations except for Windows. This is why there's only
+posix and win directories below gen/.
+When rolling gflags to a newer version, it's a good idea to check if
+new configuration files needs to be generated as well.
+Do this by running ./configure in the newly checked out version of
+gflags. Then diff the generated files with the ones below gen/.
+If you notice a diff, update the files with the updated ones.
+If you suspect platform dependend changes other than Windows, you'll
+have to checkout gflags on the other platforms as well and run
+./configure there too.
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags.h
new file mode 100644
index 00000000..0db38f5c
--- /dev/null
+++ b/files/third_party/gflags/gen/posix/include/gflags/gflags.h
@@ -0,0 +1,573 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+//    Then, at the command-line:
+//       ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GFLAGS_GFLAGS_H_
+#define GFLAGS_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+#include "gflags_declare.h" // IWYU pragma: export
+
+
+// We always want to export variables defined in user code
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DEFINE_FLAG
+#  endif
+#endif
+
+
+namespace GFLAGS_NAMESPACE {
+
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool*        flag, bool (*validate_fn)(const char*, bool));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32*       flag, bool (*validate_fn)(const char*, int32));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64*       flag, bool (*validate_fn)(const char*, int64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64*      flag, bool (*validate_fn)(const char*, uint64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double*      flag, bool (*validate_fn)(const char*, double));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
+
+// Convenience macro for the registration of a flag validator
+#define DEFINE_validator(name, validator) \
+    static const bool name##_validator_registered = \
+            GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+//
+// No need to export this data only structure from DLL, avoiding VS warning 4251.
+struct CommandLineFlagInfo {
+  std::string name;            // the name of the flag
+  std::string type;            // the type of the flag: int32, etc
+  std::string description;     // the "help text" associated with the flag
+  std::string current_value;   // the current value, as a string
+  std::string default_value;   // the default value, as a string
+  std::string filename;        // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;       // true if RegisterFlagValidator called on this flag
+  bool is_default;             // true if the flag has the default value and
+                               // has not been set explicitly from the cmdline
+                               // or via SetCommandLineOption
+  const void* flag_ptr;        // pointer to the flag's current value (i.e. FLAGS_foo)
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(user) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// gflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in gflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
+extern GFLAGS_DLL_DECL const char* GetArgv();                      // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0();                     // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum();                        // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName();        // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName();   // basename(argv0)
+
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage();                 // string set by SetUsageMessage()
+
+// VersionString() is thread-safe as long as SetVersionString() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* VersionString();                // string set by SetVersionString()
+
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum GFLAGS_DLL_DECL FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not yet updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern GFLAGS_DLL_DECL std::string SetCommandLineOption        (const char* name, const char* value);
+extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAG_foo = false;
+//     FLAG_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
+// the work is done in the constructor and destructor, so in the standard
+// usage example above, the compiler would complain that it's an
+// unused variable.
+//
+// This class is thread-safe.  However, its destructor writes to
+// exactly the set of flags that have changed value during its
+// lifetime, so concurrent _direct_ access to those flags
+// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
+
+class GFLAGS_DLL_DECL FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+}__attribute((unused));
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern GFLAGS_DLL_DECL
+bool ReadFlagsFromString(const std::string& flagfilecontents,
+                         const char* prog_name,
+                         bool errors_are_fatal);  // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
+extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
+extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
+extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
+extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse gflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   string usage("This program does nothing.  Sample usage:\n");
+//   usage += argv[0] + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
+
+// Sets the version string, which is emitted with --version.
+// For instance: SetVersionString("1.3");
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
+
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.  Returns the index (into argv)
+// of the first non-flag argument.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.  If a flag is
+// defined more than once in the command line or flag file, the last
+// definition is used.  Returns the index (into argv) of the first
+// non-flag argument.  (If remove_flags is true, will always return 1.)
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
+
+// This is actually defined in gflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags();   // in gflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.  Only flags
+// registered since the last parse will be recognized.  Any flag value
+// must be provided as part of the argument using "=", not as a
+// separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the google perftools heap-checker.  It must only
+// be called when the process is about to exit, and all threads that
+// might access flags are quiescent.  Referencing flags after this is
+// called will have unexpected consequences.  This is not safe to run
+// when multiple threads might be running: the function is
+// thread-hostile.
+extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+  FlagRegisterer(const char* name, const char* type,
+                 const char* help, const char* filename,
+                 void* current_storage, void* defvalue_storage);
+};
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
+
+
+} // namespace GFLAGS_NAMESPACE
+
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) \
+   (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAG_##value is initialized at
+// static initialization time (e.g. before program-start) rather than
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help)             \
+  namespace fL##shorttype {                                             \
+    static const type FLAGS_nono##name = value;                         \
+    /* We always want to export defined variables, dll or no */         \
+    GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name;        \
+    type FLAGS_no##name = FLAGS_nono##name;                             \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                   \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,                \
+      &FLAGS_##name, &FLAGS_no##name);                                  \
+  }                                                                     \
+  using fL##shorttype::FLAGS_##name
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// COMPILE_ASSERT.
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
+GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
+}  // namespace fLB
+
+// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
+// are in a separate include, gflags_declare.h, for reducing
+// the physical transitive size for DECLARE use.
+#define DEFINE_bool(name, val, txt)                                     \
+  namespace fLB {                                                       \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[     \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
+  }                                                                     \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DEFINE_int32(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
+                   name, val, txt)
+
+#define DEFINE_int64(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
+                   name, val, txt)
+
+#define DEFINE_uint64(name,val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
+                   name, val, txt)
+
+#define DEFINE_double(name, val, txt) \
+   DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but the best we can do.
+
+namespace fLS {
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);
+}  // namespace fLS
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+// The weird 'using' + 'extern' inside the fLS namespace is to work around
+// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10.  See
+//    http://code.google.com/p/google-gflags/issues/detail?id=20
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                       \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name;                   \
+    using fLS::FLAGS_##name;                                                \
+    clstring& FLAGS_##name = *FLAGS_no##name;                               \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+
+// Import gflags library symbols into alternative/deprecated namespace(s)
+#include "gflags_gflags.h"
+
+
+#endif  // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
new file mode 100644
index 00000000..f951c1e0
--- /dev/null
+++ b/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+
+//
+// Implement helpful bash-style command line flag completions
+//
+// ** Functional API:
+// HandleCommandLineCompletions() should be called early during
+// program startup, but after command line flag code has been
+// initialized, such as the beginning of HandleCommandLineHelpFlags().
+// It checks the value of the flag --tab_completion_word.  If this
+// flag is empty, nothing happens here.  If it contains a string,
+// however, then HandleCommandLineCompletions() will hijack the
+// process, attempting to identify the intention behind this
+// completion.  Regardless of the outcome of this deduction, the
+// process will be terminated, similar to --helpshort flag
+// handling.
+//
+// ** Overview of Bash completions:
+// Bash can be told to programatically determine completions for the
+// current 'cursor word'.  It does this by (in this case) invoking a
+// command with some additional arguments identifying the command
+// being executed, the word being completed, and the previous word
+// (if any).  Bash then expects a sequence of output lines to be
+// printed to stdout.  If these lines all contain a common prefix
+// longer than the cursor word, bash will replace the cursor word
+// with that common prefix, and display nothing.  If there isn't such
+// a common prefix, bash will display the lines in pages using 'more'.
+//
+// ** Strategy taken for command line completions:
+// If we can deduce either the exact flag intended, or a common flag
+// prefix, we'll output exactly that.  Otherwise, if information
+// must be displayed to the user, we'll take the opportunity to add
+// some helpful information beyond just the flag name (specifically,
+// we'll include the default flag value and as much of the flag's
+// description as can fit on a single terminal line width, as specified
+// by the flag --tab_completion_columns).  Furthermore, we'll try to
+// make bash order the output such that the most useful or relevent
+// flags are the most likely to be shown at the top.
+//
+// ** Additional features:
+// To assist in finding that one really useful flag, substring matching
+// was implemented.  Before pressing a <TAB> to get completion for the
+// current word, you can append one or more '?' to the flag to do
+// substring matching.  Here's the semantics:
+//   --foo<TAB>     Show me all flags with names prefixed by 'foo'
+//   --foo?<TAB>    Show me all flags with 'foo' somewhere in the name
+//   --foo??<TAB>   Same as prior case, but also search in module
+//                  definition path for 'foo'
+//   --foo???<TAB>  Same as prior case, but also search in flag
+//                  descriptions for 'foo'
+// Finally, we'll trim the output to a relatively small number of
+// flags to keep bash quiet about the verbosity of output.  If one
+// really wanted to see all possible matches, appending a '+' to the
+// search word will force the exhaustive list of matches to be printed.
+//
+// ** How to have bash accept completions from a binary:
+// Bash requires that it be informed about each command that programmatic
+// completion should be enabled for.  Example addition to a .bashrc
+// file would be (your path to gflags_completions.sh file may differ):
+
+/*
+$ complete -o bashdefault -o default -o nospace -C                            \
+ '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
+  time  env  binary_name  another_binary  [...]
+*/
+
+// This would allow the following to work:
+//   $ /path/to/binary_name --vmodule<TAB>
+// Or:
+//   $ ./bin/path/another_binary --gfs_u<TAB>
+// (etc)
+//
+// Sadly, it appears that bash gives no easy way to force this behavior for
+// all commands.  That's where the "time" in the above example comes in.
+// If you haven't specifically added a command to the list of completion
+// supported commands, you can still get completions by prefixing the
+// entire command with "env".
+//   $ env /some/brand/new/binary --vmod<TAB>
+// Assuming that "binary" is a newly compiled binary, this should still
+// produce the expected completion output.
+
+
+#ifndef GFLAGS_COMPLETIONS_H_
+#define GFLAGS_COMPLETIONS_H_
+
+namespace google {
+
+extern void HandleCommandLineCompletions(void);
+
+}
+
+#endif  // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
new file mode 100644
index 00000000..935a20e7
--- /dev/null
+++ b/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
@@ -0,0 +1,141 @@
+// Copyright (c) 1999, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+//
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// command line flag.
+
+#ifndef GFLAGS_DECLARE_H_
+#define GFLAGS_DECLARE_H_
+
+
+// ---------------------------------------------------------------------------
+// Namespace of gflags library symbols.
+#define GFLAGS_NAMESPACE google
+
+// ---------------------------------------------------------------------------
+// Windows DLL import/export.
+
+// We always want to import the symbols of the gflags library
+#ifndef GFLAGS_DLL_DECL
+#  if 0 && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+
+// We always want to import variables declared in user code
+#ifndef GFLAGS_DLL_DECLARE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECLARE_FLAG
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Flag types
+#include <string>
+#if 1
+#  include <stdint.h>                   // the normal place uint32_t is defined
+#elif 1
+#  include <sys/types.h>                // the normal place u_int32_t is defined
+#elif 1
+#  include <inttypes.h>                 // a third place for uint32_t or u_int32_t
+#endif
+
+namespace GFLAGS_NAMESPACE {
+
+#if 1 // C99
+typedef int32_t          int32;
+typedef uint32_t         uint32;
+typedef int64_t          int64;
+typedef uint64_t         uint64;
+#elif 0 // BSD
+typedef int32_t          int32;
+typedef u_int32_t        uint32;
+typedef int64_t          int64;
+typedef u_int64_t        uint64;
+#elif 0 // Windows
+typedef __int32          int32;
+typedef unsigned __int32 uint32;
+typedef __int64          int64;
+typedef unsigned __int64 uint64;
+#else
+#  error Do not know how to define a 32-bit integer quantity on your system
+#endif
+
+} // namespace GFLAGS_NAMESPACE
+
+
+namespace fLS {
+
+// The meaning of "string" might be different between now and when the
+// macros below get invoked (e.g., if someone is experimenting with
+// other string implementations that get defined after this file is
+// included).  Save the current meaning now and use it in the macros.
+typedef std::string clstring;
+
+} // namespace fLS
+
+
+#define DECLARE_VARIABLE(type, shorttype, name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
+  using fL##shorttype::FLAGS_##name
+
+#define DECLARE_bool(name) \
+  DECLARE_VARIABLE(bool, B, name)
+
+#define DECLARE_int32(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
+
+#define DECLARE_int64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
+
+#define DECLARE_uint64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
+
+#define DECLARE_double(name) \
+  DECLARE_VARIABLE(double, D, name)
+
+#define DECLARE_string(name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fLS { \
+  using ::fLS::clstring; \
+  extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
+  } \
+  using fLS::FLAGS_##name
+
+
+#endif  // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
new file mode 100644
index 00000000..0c17825d
--- /dev/null
+++ b/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2014, Andreas Schuh
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// -----------------------------------------------------------------------------
+// Imports the gflags library symbols into an alternative/deprecated namespace.
+
+#ifndef GFLAGS_GFLAGS_H_
+#  error The internal header gflags_gflags.h may only be included by gflags.h
+#endif
+
+#ifndef GFLAGS_NS_GFLAGS_H_
+#define GFLAGS_NS_GFLAGS_H_
+
+
+namespace gflags {
+
+
+using GFLAGS_NAMESPACE::int32;
+using GFLAGS_NAMESPACE::uint32;
+using GFLAGS_NAMESPACE::int64;
+using GFLAGS_NAMESPACE::uint64;
+
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::CommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetAllFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
+using GFLAGS_NAMESPACE::DescribeOneFlag;
+using GFLAGS_NAMESPACE::SetArgv;
+using GFLAGS_NAMESPACE::GetArgvs;
+using GFLAGS_NAMESPACE::GetArgv;
+using GFLAGS_NAMESPACE::GetArgv0;
+using GFLAGS_NAMESPACE::GetArgvSum;
+using GFLAGS_NAMESPACE::ProgramInvocationName;
+using GFLAGS_NAMESPACE::ProgramInvocationShortName;
+using GFLAGS_NAMESPACE::ProgramUsage;
+using GFLAGS_NAMESPACE::VersionString;
+using GFLAGS_NAMESPACE::GetCommandLineOption;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
+using GFLAGS_NAMESPACE::FlagSettingMode;
+using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
+using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
+using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
+using GFLAGS_NAMESPACE::SetCommandLineOption;
+using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
+using GFLAGS_NAMESPACE::FlagSaver;
+using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
+using GFLAGS_NAMESPACE::ReadFlagsFromString;
+using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
+using GFLAGS_NAMESPACE::ReadFromFlagsFile;
+using GFLAGS_NAMESPACE::BoolFromEnv;
+using GFLAGS_NAMESPACE::Int32FromEnv;
+using GFLAGS_NAMESPACE::Int64FromEnv;
+using GFLAGS_NAMESPACE::Uint64FromEnv;
+using GFLAGS_NAMESPACE::DoubleFromEnv;
+using GFLAGS_NAMESPACE::StringFromEnv;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+using GFLAGS_NAMESPACE::SetVersionString;
+using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
+using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
+using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
+using GFLAGS_NAMESPACE::FlagRegisterer;
+
+#ifndef SWIG
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+#endif
+
+
+} // namespace gflags
+
+
+#endif  // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/private/config.h b/files/third_party/gflags/gen/posix/include/private/config.h
new file mode 100644
index 00000000..592d61c4
--- /dev/null
+++ b/files/third_party/gflags/gen/posix/include/private/config.h
@@ -0,0 +1,112 @@
+/* Generated from config.h.in during build configuration using CMake. */
+
+// Note: This header file is only used internally. It is not part of public interface!
+
+// ---------------------------------------------------------------------------
+// System checks
+
+// Define if you build this library for a MS Windows OS.
+/* #undef OS_WINDOWS */
+
+// Define if you have the <stdint.h> header file.
+#define HAVE_STDINT_H
+
+// Define if you have the <sys/types.h> header file.
+#define HAVE_SYS_TYPES_H
+
+// Define if you have the <inttypes.h> header file.
+#define HAVE_INTTYPES_H
+
+// Define if you have the <sys/stat.h> header file.
+#define HAVE_SYS_STAT_H
+
+// Define if you have the <unistd.h> header file.
+#define HAVE_UNISTD_H
+
+// Define if you have the <fnmatch.h> header file.
+#define HAVE_FNMATCH_H
+
+// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
+/* #undef HAVE_SHLWAPI_H */
+
+// Define if you have the strtoll function.
+#define HAVE_STRTOLL
+
+// Define if you have the strtoq function.
+/* #undef HAVE_STRTOQ */
+
+// Define if you have the <pthread.h> header file.
+#define HAVE_PTHREAD
+
+// Define if your pthread library defines the type pthread_rwlock_t
+#define HAVE_RWLOCK
+
+// gcc requires this to get PRId64, etc.
+#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS 1
+#endif
+
+// ---------------------------------------------------------------------------
+// Package information
+
+// Name of package.
+#define PACKAGE gflags
+
+// Define to the full name of this package.
+#define PACKAGE_NAME gflags
+
+// Define to the full name and version of this package.
+#define PACKAGE_STRING gflags 2.2.0
+
+// Define to the one symbol short name of this package.
+#define PACKAGE_TARNAME gflags-2.2.0
+
+// Define to the version of this package.
+#define PACKAGE_VERSION 2.2.0
+
+// Version number of package.
+#define VERSION PACKAGE_VERSION
+
+// Define to the address where bug reports for this package should be sent.
+#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
+
+// ---------------------------------------------------------------------------
+// Path separator
+#ifndef PATH_SEPARATOR
+#  ifdef OS_WINDOWS
+#    define PATH_SEPARATOR  '\\'
+#  else
+#    define PATH_SEPARATOR  '/'
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Windows
+
+// Whether gflags library is a DLL.
+#ifndef GFLAGS_IS_A_DLL
+#  define GFLAGS_IS_A_DLL 0
+#endif
+
+// Always export symbols when compiling a shared library as this file is only
+// included by internal modules when building the gflags library itself.
+// The gflags_declare.h header file will set it to import these symbols otherwise.
+#ifndef GFLAGS_DLL_DECL
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+// Flags defined by the gflags library itself must be exported
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
+#endif
+
+#ifdef OS_WINDOWS
+// The unittests import the symbols of the shared gflags library
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
+#  endif
+#  include "windows_port.h"
+#endif
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags.h
new file mode 100644
index 00000000..357eec6b
--- /dev/null
+++ b/files/third_party/gflags/gen/win/include/gflags/gflags.h
@@ -0,0 +1,573 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+//    Then, at the command-line:
+//       ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GFLAGS_GFLAGS_H_
+#define GFLAGS_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+#include "gflags_declare.h" // IWYU pragma: export
+
+
+// We always want to export variables defined in user code
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DEFINE_FLAG
+#  endif
+#endif
+
+
+namespace GFLAGS_NAMESPACE {
+
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool*        flag, bool (*validate_fn)(const char*, bool));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32*       flag, bool (*validate_fn)(const char*, int32));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64*       flag, bool (*validate_fn)(const char*, int64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64*      flag, bool (*validate_fn)(const char*, uint64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double*      flag, bool (*validate_fn)(const char*, double));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
+
+// Convenience macro for the registration of a flag validator
+#define DEFINE_validator(name, validator) \
+    static const bool name##_validator_registered = \
+            GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+//
+// No need to export this data only structure from DLL, avoiding VS warning 4251.
+struct CommandLineFlagInfo {
+  std::string name;            // the name of the flag
+  std::string type;            // the type of the flag: int32, etc
+  std::string description;     // the "help text" associated with the flag
+  std::string current_value;   // the current value, as a string
+  std::string default_value;   // the default value, as a string
+  std::string filename;        // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;       // true if RegisterFlagValidator called on this flag
+  bool is_default;             // true if the flag has the default value and
+                               // has not been set explicitly from the cmdline
+                               // or via SetCommandLineOption
+  const void* flag_ptr;        // pointer to the flag's current value (i.e. FLAGS_foo)
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(user) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// gflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in gflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
+extern GFLAGS_DLL_DECL const char* GetArgv();                      // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0();                     // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum();                        // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName();        // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName();   // basename(argv0)
+
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage();                 // string set by SetUsageMessage()
+
+// VersionString() is thread-safe as long as SetVersionString() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* VersionString();                // string set by SetVersionString()
+
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum GFLAGS_DLL_DECL FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not yet updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern GFLAGS_DLL_DECL std::string SetCommandLineOption        (const char* name, const char* value);
+extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAG_foo = false;
+//     FLAG_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
+// the work is done in the constructor and destructor, so in the standard
+// usage example above, the compiler would complain that it's an
+// unused variable.
+//
+// This class is thread-safe.  However, its destructor writes to
+// exactly the set of flags that have changed value during its
+// lifetime, so concurrent _direct_ access to those flags
+// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
+
+class GFLAGS_DLL_DECL FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+};
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern GFLAGS_DLL_DECL
+bool ReadFlagsFromString(const std::string& flagfilecontents,
+                         const char* prog_name,
+                         bool errors_are_fatal);  // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
+extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
+extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
+extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
+extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse gflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   string usage("This program does nothing.  Sample usage:\n");
+//   usage += argv[0] + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
+
+// Sets the version string, which is emitted with --version.
+// For instance: SetVersionString("1.3");
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
+
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.  Returns the index (into argv)
+// of the first non-flag argument.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.  If a flag is
+// defined more than once in the command line or flag file, the last
+// definition is used.  Returns the index (into argv) of the first
+// non-flag argument.  (If remove_flags is true, will always return 1.)
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
+
+// This is actually defined in gflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags();   // in gflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.  Only flags
+// registered since the last parse will be recognized.  Any flag value
+// must be provided as part of the argument using "=", not as a
+// separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the google perftools heap-checker.  It must only
+// be called when the process is about to exit, and all threads that
+// might access flags are quiescent.  Referencing flags after this is
+// called will have unexpected consequences.  This is not safe to run
+// when multiple threads might be running: the function is
+// thread-hostile.
+extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+  FlagRegisterer(const char* name, const char* type,
+                 const char* help, const char* filename,
+                 void* current_storage, void* defvalue_storage);
+};
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
+
+
+} // namespace GFLAGS_NAMESPACE
+
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) \
+   (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAG_##value is initialized at
+// static initialization time (e.g. before program-start) rather than
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help)             \
+  namespace fL##shorttype {                                             \
+    static const type FLAGS_nono##name = value;                         \
+    /* We always want to export defined variables, dll or no */         \
+    GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name;        \
+    type FLAGS_no##name = FLAGS_nono##name;                             \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                   \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,                \
+      &FLAGS_##name, &FLAGS_no##name);                                  \
+  }                                                                     \
+  using fL##shorttype::FLAGS_##name
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// COMPILE_ASSERT.
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
+GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
+}  // namespace fLB
+
+// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
+// are in a separate include, gflags_declare.h, for reducing
+// the physical transitive size for DECLARE use.
+#define DEFINE_bool(name, val, txt)                                     \
+  namespace fLB {                                                       \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[     \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
+  }                                                                     \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DEFINE_int32(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
+                   name, val, txt)
+
+#define DEFINE_int64(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
+                   name, val, txt)
+
+#define DEFINE_uint64(name,val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
+                   name, val, txt)
+
+#define DEFINE_double(name, val, txt) \
+   DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but the best we can do.
+
+namespace fLS {
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);
+}  // namespace fLS
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+// The weird 'using' + 'extern' inside the fLS namespace is to work around
+// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10.  See
+//    http://code.google.com/p/google-gflags/issues/detail?id=20
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                       \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name;                   \
+    using fLS::FLAGS_##name;                                                \
+    clstring& FLAGS_##name = *FLAGS_no##name;                               \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+
+// Import gflags library symbols into alternative/deprecated namespace(s)
+#include "gflags_gflags.h"
+
+
+#endif  // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
new file mode 100644
index 00000000..f951c1e0
--- /dev/null
+++ b/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+
+//
+// Implement helpful bash-style command line flag completions
+//
+// ** Functional API:
+// HandleCommandLineCompletions() should be called early during
+// program startup, but after command line flag code has been
+// initialized, such as the beginning of HandleCommandLineHelpFlags().
+// It checks the value of the flag --tab_completion_word.  If this
+// flag is empty, nothing happens here.  If it contains a string,
+// however, then HandleCommandLineCompletions() will hijack the
+// process, attempting to identify the intention behind this
+// completion.  Regardless of the outcome of this deduction, the
+// process will be terminated, similar to --helpshort flag
+// handling.
+//
+// ** Overview of Bash completions:
+// Bash can be told to programatically determine completions for the
+// current 'cursor word'.  It does this by (in this case) invoking a
+// command with some additional arguments identifying the command
+// being executed, the word being completed, and the previous word
+// (if any).  Bash then expects a sequence of output lines to be
+// printed to stdout.  If these lines all contain a common prefix
+// longer than the cursor word, bash will replace the cursor word
+// with that common prefix, and display nothing.  If there isn't such
+// a common prefix, bash will display the lines in pages using 'more'.
+//
+// ** Strategy taken for command line completions:
+// If we can deduce either the exact flag intended, or a common flag
+// prefix, we'll output exactly that.  Otherwise, if information
+// must be displayed to the user, we'll take the opportunity to add
+// some helpful information beyond just the flag name (specifically,
+// we'll include the default flag value and as much of the flag's
+// description as can fit on a single terminal line width, as specified
+// by the flag --tab_completion_columns).  Furthermore, we'll try to
+// make bash order the output such that the most useful or relevent
+// flags are the most likely to be shown at the top.
+//
+// ** Additional features:
+// To assist in finding that one really useful flag, substring matching
+// was implemented.  Before pressing a <TAB> to get completion for the
+// current word, you can append one or more '?' to the flag to do
+// substring matching.  Here's the semantics:
+//   --foo<TAB>     Show me all flags with names prefixed by 'foo'
+//   --foo?<TAB>    Show me all flags with 'foo' somewhere in the name
+//   --foo??<TAB>   Same as prior case, but also search in module
+//                  definition path for 'foo'
+//   --foo???<TAB>  Same as prior case, but also search in flag
+//                  descriptions for 'foo'
+// Finally, we'll trim the output to a relatively small number of
+// flags to keep bash quiet about the verbosity of output.  If one
+// really wanted to see all possible matches, appending a '+' to the
+// search word will force the exhaustive list of matches to be printed.
+//
+// ** How to have bash accept completions from a binary:
+// Bash requires that it be informed about each command that programmatic
+// completion should be enabled for.  Example addition to a .bashrc
+// file would be (your path to gflags_completions.sh file may differ):
+
+/*
+$ complete -o bashdefault -o default -o nospace -C                            \
+ '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
+  time  env  binary_name  another_binary  [...]
+*/
+
+// This would allow the following to work:
+//   $ /path/to/binary_name --vmodule<TAB>
+// Or:
+//   $ ./bin/path/another_binary --gfs_u<TAB>
+// (etc)
+//
+// Sadly, it appears that bash gives no easy way to force this behavior for
+// all commands.  That's where the "time" in the above example comes in.
+// If you haven't specifically added a command to the list of completion
+// supported commands, you can still get completions by prefixing the
+// entire command with "env".
+//   $ env /some/brand/new/binary --vmod<TAB>
+// Assuming that "binary" is a newly compiled binary, this should still
+// produce the expected completion output.
+
+
+#ifndef GFLAGS_COMPLETIONS_H_
+#define GFLAGS_COMPLETIONS_H_
+
+namespace google {
+
+extern void HandleCommandLineCompletions(void);
+
+}
+
+#endif  // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
new file mode 100644
index 00000000..fbc8466f
--- /dev/null
+++ b/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
@@ -0,0 +1,141 @@
+// Copyright (c) 1999, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+//
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// command line flag.
+
+#ifndef GFLAGS_DECLARE_H_
+#define GFLAGS_DECLARE_H_
+
+
+// ---------------------------------------------------------------------------
+// Namespace of gflags library symbols.
+#define GFLAGS_NAMESPACE google
+
+// ---------------------------------------------------------------------------
+// Windows DLL import/export.
+
+// We always want to import the symbols of the gflags library
+#ifndef GFLAGS_DLL_DECL
+#  if 0 && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+
+// We always want to import variables declared in user code
+#ifndef GFLAGS_DLL_DECLARE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECLARE_FLAG
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Flag types
+#include <string>
+#if 1
+#  include <stdint.h>                   // the normal place uint32_t is defined
+#elif 1
+#  include <sys/types.h>                // the normal place u_int32_t is defined
+#elif 0
+#  include <inttypes.h>                 // a third place for uint32_t or u_int32_t
+#endif
+
+namespace GFLAGS_NAMESPACE {
+
+#if 0 // C99
+typedef int32_t          int32;
+typedef uint32_t         uint32;
+typedef int64_t          int64;
+typedef uint64_t         uint64;
+#elif 0 // BSD
+typedef int32_t          int32;
+typedef u_int32_t        uint32;
+typedef int64_t          int64;
+typedef u_int64_t        uint64;
+#elif 1 // Windows
+typedef __int32          int32;
+typedef unsigned __int32 uint32;
+typedef __int64          int64;
+typedef unsigned __int64 uint64;
+#else
+#  error Do not know how to define a 32-bit integer quantity on your system
+#endif
+
+} // namespace GFLAGS_NAMESPACE
+
+
+namespace fLS {
+
+// The meaning of "string" might be different between now and when the
+// macros below get invoked (e.g., if someone is experimenting with
+// other string implementations that get defined after this file is
+// included).  Save the current meaning now and use it in the macros.
+typedef std::string clstring;
+
+} // namespace fLS
+
+
+#define DECLARE_VARIABLE(type, shorttype, name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
+  using fL##shorttype::FLAGS_##name
+
+#define DECLARE_bool(name) \
+  DECLARE_VARIABLE(bool, B, name)
+
+#define DECLARE_int32(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
+
+#define DECLARE_int64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
+
+#define DECLARE_uint64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
+
+#define DECLARE_double(name) \
+  DECLARE_VARIABLE(double, D, name)
+
+#define DECLARE_string(name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fLS { \
+  using ::fLS::clstring; \
+  extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
+  } \
+  using fLS::FLAGS_##name
+
+
+#endif  // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
new file mode 100644
index 00000000..0c17825d
--- /dev/null
+++ b/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2014, Andreas Schuh
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// -----------------------------------------------------------------------------
+// Imports the gflags library symbols into an alternative/deprecated namespace.
+
+#ifndef GFLAGS_GFLAGS_H_
+#  error The internal header gflags_gflags.h may only be included by gflags.h
+#endif
+
+#ifndef GFLAGS_NS_GFLAGS_H_
+#define GFLAGS_NS_GFLAGS_H_
+
+
+namespace gflags {
+
+
+using GFLAGS_NAMESPACE::int32;
+using GFLAGS_NAMESPACE::uint32;
+using GFLAGS_NAMESPACE::int64;
+using GFLAGS_NAMESPACE::uint64;
+
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::CommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetAllFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
+using GFLAGS_NAMESPACE::DescribeOneFlag;
+using GFLAGS_NAMESPACE::SetArgv;
+using GFLAGS_NAMESPACE::GetArgvs;
+using GFLAGS_NAMESPACE::GetArgv;
+using GFLAGS_NAMESPACE::GetArgv0;
+using GFLAGS_NAMESPACE::GetArgvSum;
+using GFLAGS_NAMESPACE::ProgramInvocationName;
+using GFLAGS_NAMESPACE::ProgramInvocationShortName;
+using GFLAGS_NAMESPACE::ProgramUsage;
+using GFLAGS_NAMESPACE::VersionString;
+using GFLAGS_NAMESPACE::GetCommandLineOption;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
+using GFLAGS_NAMESPACE::FlagSettingMode;
+using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
+using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
+using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
+using GFLAGS_NAMESPACE::SetCommandLineOption;
+using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
+using GFLAGS_NAMESPACE::FlagSaver;
+using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
+using GFLAGS_NAMESPACE::ReadFlagsFromString;
+using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
+using GFLAGS_NAMESPACE::ReadFromFlagsFile;
+using GFLAGS_NAMESPACE::BoolFromEnv;
+using GFLAGS_NAMESPACE::Int32FromEnv;
+using GFLAGS_NAMESPACE::Int64FromEnv;
+using GFLAGS_NAMESPACE::Uint64FromEnv;
+using GFLAGS_NAMESPACE::DoubleFromEnv;
+using GFLAGS_NAMESPACE::StringFromEnv;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+using GFLAGS_NAMESPACE::SetVersionString;
+using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
+using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
+using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
+using GFLAGS_NAMESPACE::FlagRegisterer;
+
+#ifndef SWIG
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+#endif
+
+
+} // namespace gflags
+
+
+#endif  // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/private/config.h b/files/third_party/gflags/gen/win/include/private/config.h
new file mode 100644
index 00000000..d541580e
--- /dev/null
+++ b/files/third_party/gflags/gen/win/include/private/config.h
@@ -0,0 +1,112 @@
+/* Generated from config.h.in during build configuration using CMake. */
+
+// Note: This header file is only used internally. It is not part of public interface!
+
+// ---------------------------------------------------------------------------
+// System checks
+
+// Define if you build this library for a MS Windows OS.
+#define OS_WINDOWS
+
+// Define if you have the <stdint.h> header file.
+#define HAVE_STDINT_H
+
+// Define if you have the <sys/types.h> header file.
+#define HAVE_SYS_TYPES_H
+
+// Define if you have the <inttypes.h> header file.
+/* #undef HAVE_INTTYPES_H */
+
+// Define if you have the <sys/stat.h> header file.
+#define HAVE_SYS_STAT_H
+
+// Define if you have the <unistd.h> header file.
+/* #undef HAVE_UNISTD_H */
+
+// Define if you have the <fnmatch.h> header file.
+/* #undef HAVE_FNMATCH_H */
+
+// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
+#define HAVE_SHLWAPI_H
+
+// Define if you have the strtoll function.
+/* #undef HAVE_STRTOLL */
+
+// Define if you have the strtoq function.
+/* #undef HAVE_STRTOQ */
+
+// Define if you have the <pthread.h> header file.
+/* #undef HAVE_PTHREAD */
+
+// Define if your pthread library defines the type pthread_rwlock_t
+/* #undef HAVE_RWLOCK */
+
+// gcc requires this to get PRId64, etc.
+#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS 1
+#endif
+
+// ---------------------------------------------------------------------------
+// Package information
+
+// Name of package.
+#define PACKAGE gflags
+
+// Define to the full name of this package.
+#define PACKAGE_NAME gflags
+
+// Define to the full name and version of this package.
+#define PACKAGE_STRING gflags 2.2.0
+
+// Define to the one symbol short name of this package.
+#define PACKAGE_TARNAME gflags-2.2.0
+
+// Define to the version of this package.
+#define PACKAGE_VERSION 2.2.0
+
+// Version number of package.
+#define VERSION PACKAGE_VERSION
+
+// Define to the address where bug reports for this package should be sent.
+#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
+
+// ---------------------------------------------------------------------------
+// Path separator
+#ifndef PATH_SEPARATOR
+#  ifdef OS_WINDOWS
+#    define PATH_SEPARATOR  '\\'
+#  else
+#    define PATH_SEPARATOR  '/'
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Windows
+
+// Whether gflags library is a DLL.
+#ifndef GFLAGS_IS_A_DLL
+#  define GFLAGS_IS_A_DLL 0
+#endif
+
+// Always export symbols when compiling a shared library as this file is only
+// included by internal modules when building the gflags library itself.
+// The gflags_declare.h header file will set it to import these symbols otherwise.
+#ifndef GFLAGS_DLL_DECL
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+// Flags defined by the gflags library itself must be exported
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
+#endif
+
+#ifdef OS_WINDOWS
+// The unittests import the symbols of the shared gflags library
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
+#  endif
+#  include "windows_port.h"
+#endif
diff --git a/files/third_party/gflags/gflags.gyp b/files/third_party/gflags/gflags.gyp
new file mode 100644
index 00000000..37f2815a
--- /dev/null
+++ b/files/third_party/gflags/gflags.gyp
@@ -0,0 +1,92 @@
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of WebRTC's gflags.gyp.
+
+{
+  'variables': {
+    'gflags_root': '<(DEPTH)/third_party/gflags',
+    'conditions': [
+      ['OS=="win"', {
+        'gflags_gen_arch_root': '<(gflags_root)/gen/win',
+      }, {
+        'gflags_gen_arch_root': '<(gflags_root)/gen/posix',
+      }],
+    ],
+  },
+  'targets': [
+    {
+      'target_name': 'gflags',
+      'type': 'static_library',
+      'include_dirs': [
+        '<(gflags_gen_arch_root)/include/gflags',  # For configured files.
+        '<(gflags_gen_arch_root)/include/private',  # For config.h
+        '<(gflags_root)/src/src',  # For everything else.
+      ],
+      'defines': [
+        # These macros exist so flags and symbols are properly
+        # exported when building DLLs. Since we don't build DLLs, we
+        # need to disable them.
+        'GFLAGS_DLL_DECL=',
+        'GFLAGS_DLL_DECLARE_FLAG=',
+        'GFLAGS_DLL_DEFINE_FLAG=',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          '<(gflags_gen_arch_root)/include',  # For configured files.
+          '<(gflags_root)/src/src',  # For everything else.
+        ],
+        'defines': [
+          'GFLAGS_DLL_DECL=',
+          'GFLAGS_DLL_DECLARE_FLAG=',
+          'GFLAGS_DLL_DEFINE_FLAG=',
+        ],
+      },
+      'sources': [
+        'src/src/gflags.cc',
+        'src/src/gflags_completions.cc',
+        'src/src/gflags_reporting.cc',
+      ],
+      'conditions': [
+        ['OS=="win"', {
+          'sources': [
+            'src/src/windows_port.cc',
+          ],
+          'msvs_disabled_warnings': [
+            4005,  # WIN32_LEAN_AND_MEAN redefinition.
+            4267,  # Conversion from size_t to "type".
+          ],
+          'configurations': {
+            'Common_Base': {
+              'msvs_configuration_attributes': {
+                'CharacterSet': '2',  # Use Multi-byte Character Set.
+              },
+            },
+          },
+        }],
+        # TODO(andrew): Look into fixing this warning upstream:
+        # http://code.google.com/p/webrtc/issues/detail?id=760
+        ['OS=="win" and clang==1', {
+          'msvs_settings': {
+            'VCCLCompilerTool': {
+              'AdditionalOptions': [
+                '-Wno-microsoft-include',
+              ],
+            },
+          },
+        }],
+        ['clang==1', {
+          'cflags': [
+            '-Wno-microsoft-include',
+          ],
+        }],
+      ],
+    },
+  ],
+}
diff --git a/files/tools/OWNERS b/files/tools/OWNERS
new file mode 100644
index 00000000..aca046d4
--- /dev/null
+++ b/files/tools/OWNERS
@@ -0,0 +1 @@
+kjellander@chromium.org
diff --git a/files/tools/msan/OWNERS b/files/tools/msan/OWNERS
new file mode 100644
index 00000000..60351e7e
--- /dev/null
+++ b/files/tools/msan/OWNERS
@@ -0,0 +1,3 @@
+pbos@chromium.org
+kjellander@chromium.org
+
diff --git a/files/tools/msan/blacklist.txt b/files/tools/msan/blacklist.txt
new file mode 100644
index 00000000..8b5e42a7
--- /dev/null
+++ b/files/tools/msan/blacklist.txt
@@ -0,0 +1,9 @@
+# The rules in this file are only applied at compile time.
+# Because the Chrome buildsystem does not automatically touch the files
+# mentioned here, changing this file requires clobbering all MSan bots.
+#
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
diff --git a/files/tools/ubsan/OWNERS b/files/tools/ubsan/OWNERS
new file mode 100644
index 00000000..b608519a
--- /dev/null
+++ b/files/tools/ubsan/OWNERS
@@ -0,0 +1,4 @@
+pbos@webrtc.org
+kjellander@webrtc.org
+fbarchard@chromium.org
+
diff --git a/files/tools/ubsan/blacklist.txt b/files/tools/ubsan/blacklist.txt
new file mode 100644
index 00000000..8bcb2907
--- /dev/null
+++ b/files/tools/ubsan/blacklist.txt
@@ -0,0 +1,15 @@
+#############################################################################
+# UBSan blacklist.
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding WebRTC-specific blacklist entries.
+
+#############################################################################
+# YASM does some funny things that UBsan doesn't like.
+# https://crbug.com/489901
+src:*/third_party/yasm/*
+
+#############################################################################
+# Ignore system libraries.
+src:*/usr/*
diff --git a/files/tools/ubsan/vptr_blacklist.txt b/files/tools/ubsan/vptr_blacklist.txt
new file mode 100644
index 00000000..8ed070c0
--- /dev/null
+++ b/files/tools/ubsan/vptr_blacklist.txt
@@ -0,0 +1,21 @@
+#############################################################################
+# UBSan vptr blacklist.
+# Function and type based blacklisting use a mangled name, and it is especially
+# tricky to represent C++ types. For now, any possible changes by name manglings
+# are simply represented as wildcard expressions of regexp, and thus it might be
+# over-blacklisted.
+#
+# Please think twice before you add or remove these rules.
+#
+# This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
+#############################################################################
+# Using raw pointer values.
+#
+# A raw pointer value (16) is used to infer the field offset by
+# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
+
+# Example:
+# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
+
diff --git a/files/unit_test/basictypes_test.cc b/files/unit_test/basictypes_test.cc
new file mode 100644
index 00000000..89f7644d
--- /dev/null
+++ b/files/unit_test/basictypes_test.cc
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, Endian) {
+  uint16 v16 = 0x1234u;
+  uint8 first_byte = *reinterpret_cast<uint8*>(&v16);
+#if defined(LIBYUV_LITTLE_ENDIAN)
+  EXPECT_EQ(0x34u, first_byte);
+#else
+  EXPECT_EQ(0x12u, first_byte);
+#endif
+}
+
+TEST_F(LibYUVBaseTest, SizeOfTypes) {
+  int8 i8 = -1;
+  uint8 u8 = 1u;
+  int16 i16 = -1;
+  uint16 u16 = 1u;
+  int32 i32 = -1;
+  uint32 u32 = 1u;
+  int64 i64 = -1;
+  uint64 u64 = 1u;
+  EXPECT_EQ(1u, sizeof(i8));
+  EXPECT_EQ(1u, sizeof(u8));
+  EXPECT_EQ(2u, sizeof(i16));
+  EXPECT_EQ(2u, sizeof(u16));
+  EXPECT_EQ(4u, sizeof(i32));
+  EXPECT_EQ(4u, sizeof(u32));
+  EXPECT_EQ(8u, sizeof(i64));
+  EXPECT_EQ(8u, sizeof(u64));
+  EXPECT_GT(0, i8);
+  EXPECT_LT(0u, u8);
+  EXPECT_GT(0, i16);
+  EXPECT_LT(0u, u16);
+  EXPECT_GT(0, i32);
+  EXPECT_LT(0u, u32);
+  EXPECT_GT(0, i64);
+  EXPECT_LT(0u, u64);
+}
+
+TEST_F(LibYUVBaseTest, SizeOfConstants) {
+  EXPECT_EQ(8u, sizeof(INT64_C(0)));
+  EXPECT_EQ(8u, sizeof(UINT64_C(0)));
+  EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));
+  EXPECT_EQ(8u, sizeof(UINT64_C(0x8765432112345678)));
+}
+
+}  // namespace libyuv
diff --git a/files/unit_test/color_test.cc b/files/unit_test/color_test.cc
new file mode 100644
index 00000000..36041d99
--- /dev/null
+++ b/files/unit_test/color_test.cc
@@ -0,0 +1,570 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define ERROR_R 1
+#define ERROR_G 1
+#define ERROR_B 3
+#define ERROR_FULL 6
+#define ERROR_J420 5
+#else
+#define ERROR_R 1
+#define ERROR_G 1
+#define ERROR_B 3
+#define ERROR_FULL 5
+#define ERROR_J420 3
+#endif
+
+#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF)              \
+  TEST_F(LibYUVColorTest, TESTNAME) {                                          \
+  const int kPixels = benchmark_width_ * benchmark_height_;                    \
+  const int kHalfPixels = ((benchmark_width_ + 1) / 2) *                       \
+      ((benchmark_height_ + HS1) / HS);                                        \
+  align_buffer_page_end(orig_y, kPixels);                                      \
+  align_buffer_page_end(orig_u, kHalfPixels);                                  \
+  align_buffer_page_end(orig_v, kHalfPixels);                                  \
+  align_buffer_page_end(orig_pixels, kPixels * 4);                             \
+  align_buffer_page_end(temp_y, kPixels);                                      \
+  align_buffer_page_end(temp_u, kHalfPixels);                                  \
+  align_buffer_page_end(temp_v, kHalfPixels);                                  \
+  align_buffer_page_end(dst_pixels_opt, kPixels * 4);                          \
+  align_buffer_page_end(dst_pixels_c, kPixels * 4);                            \
+                                                                               \
+  MemRandomize(orig_pixels, kPixels * 4);                                      \
+  MemRandomize(orig_y, kPixels);                                               \
+  MemRandomize(orig_u, kHalfPixels);                                           \
+  MemRandomize(orig_v, kHalfPixels);                                           \
+  MemRandomize(temp_y, kPixels);                                               \
+  MemRandomize(temp_u, kHalfPixels);                                           \
+  MemRandomize(temp_v, kHalfPixels);                                           \
+  MemRandomize(dst_pixels_opt, kPixels * 4);                                   \
+  MemRandomize(dst_pixels_c, kPixels * 4);                                     \
+                                                                               \
+  /* The test is overall for color conversion matrix being reversible, so */   \
+  /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */   \
+  uint8* p = orig_y;                                                           \
+  for (int y = 0; y < benchmark_height_ - HS1; y += HS) {                      \
+    for (int x = 0; x < benchmark_width_ - 1; x += 2) {                        \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[1] = r;                                                                \
+      p[HN] = r;                                                               \
+      p[HN + 1] = r;                                                           \
+      p += 2;                                                                  \
+    }                                                                          \
+    if (benchmark_width_ & 1) {                                                \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[HN] = r;                                                               \
+      p += 1;                                                                  \
+    }                                                                          \
+    p += HN;                                                                   \
+  }                                                                            \
+  if ((benchmark_height_ & 1) && HS == 2) {                                    \
+    for (int x = 0; x < benchmark_width_ - 1; x += 2) {                        \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[1] = r;                                                                \
+      p += 2;                                                                  \
+    }                                                                          \
+    if (benchmark_width_ & 1) {                                                \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p += 1;                                                                  \
+    }                                                                          \
+  }                                                                            \
+  /* Start with YUV converted to ARGB. */                                      \
+  YUVTOARGB(orig_y, benchmark_width_,                                          \
+            orig_u, (benchmark_width_ + 1) / 2,                                \
+            orig_v, (benchmark_width_ + 1) / 2,                                \
+            orig_pixels, benchmark_width_ * 4,                                 \
+            benchmark_width_, benchmark_height_);                              \
+                                                                               \
+  ARGBTOYUV(orig_pixels, benchmark_width_ * 4,                                 \
+            temp_y, benchmark_width_,                                          \
+            temp_u, (benchmark_width_ + 1) / 2,                                \
+            temp_v, (benchmark_width_ + 1) / 2,                                \
+            benchmark_width_, benchmark_height_);                              \
+                                                                               \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  YUVTOARGB(temp_y, benchmark_width_,                                          \
+            temp_u, (benchmark_width_ + 1) / 2,                                \
+            temp_v, (benchmark_width_ + 1) / 2,                                \
+            dst_pixels_c, benchmark_width_ * 4,                                \
+            benchmark_width_, benchmark_height_);                              \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+                                                                               \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    YUVTOARGB(temp_y, benchmark_width_,                                        \
+              temp_u, (benchmark_width_ + 1) / 2,                              \
+              temp_v, (benchmark_width_ + 1) / 2,                              \
+              dst_pixels_opt, benchmark_width_ * 4,                            \
+              benchmark_width_, benchmark_height_);                            \
+  }                                                                            \
+  /* Test C and SIMD match. */                                                 \
+  for (int i = 0; i < kPixels * 4; ++i) {                                      \
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                             \
+  }                                                                            \
+  /* Test SIMD is close to original. */                                        \
+  for (int i = 0; i < kPixels * 4; ++i) {                                      \
+    EXPECT_NEAR(static_cast<int>(orig_pixels[i]),                              \
+                static_cast<int>(dst_pixels_opt[i]), DIFF);                    \
+  }                                                                            \
+                                                                               \
+  free_aligned_buffer_page_end(orig_pixels);                                   \
+  free_aligned_buffer_page_end(orig_y);                                        \
+  free_aligned_buffer_page_end(orig_u);                                        \
+  free_aligned_buffer_page_end(orig_v);                                        \
+  free_aligned_buffer_page_end(temp_y);                                        \
+  free_aligned_buffer_page_end(temp_u);                                        \
+  free_aligned_buffer_page_end(temp_v);                                        \
+  free_aligned_buffer_page_end(dst_pixels_opt);                                \
+  free_aligned_buffer_page_end(dst_pixels_c);                                  \
+}                                                                              \
+
+TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL)
+TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL)
+TESTCS(TestJ420, J420ToARGB, ARGBToJ420, 1, 2, benchmark_width_, ERROR_J420)
+TESTCS(TestJ422, J422ToARGB, ARGBToJ422, 0, 1, 0, ERROR_J420)
+
+static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kPixels = kWidth * kHeight;
+  const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+  SIMD_ALIGNED(uint8 orig_y[16]);
+  SIMD_ALIGNED(uint8 orig_u[8]);
+  SIMD_ALIGNED(uint8 orig_v[8]);
+  SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+  memset(orig_y, y, kPixels);
+  memset(orig_u, u, kHalfPixels);
+  memset(orig_v, v, kHalfPixels);
+
+  /* YUV converted to ARGB. */
+  I422ToARGB(orig_y, kWidth,
+             orig_u, (kWidth + 1) / 2,
+             orig_v, (kWidth + 1) / 2,
+             orig_pixels, kWidth * 4,
+             kWidth, kHeight);
+
+  *b = orig_pixels[0];
+  *g = orig_pixels[1];
+  *r = orig_pixels[2];
+}
+
+static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kPixels = kWidth * kHeight;
+  const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+  SIMD_ALIGNED(uint8 orig_y[16]);
+  SIMD_ALIGNED(uint8 orig_u[8]);
+  SIMD_ALIGNED(uint8 orig_v[8]);
+  SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+  memset(orig_y, y, kPixels);
+  memset(orig_u, u, kHalfPixels);
+  memset(orig_v, v, kHalfPixels);
+
+  /* YUV converted to ARGB. */
+  J422ToARGB(orig_y, kWidth,
+             orig_u, (kWidth + 1) / 2,
+             orig_v, (kWidth + 1) / 2,
+             orig_pixels, kWidth * 4,
+             kWidth, kHeight);
+
+  *b = orig_pixels[0];
+  *g = orig_pixels[1];
+  *r = orig_pixels[2];
+}
+
+static void YToRGB(int y, int* r, int* g, int* b) {
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kPixels = kWidth * kHeight;
+
+  SIMD_ALIGNED(uint8 orig_y[16]);
+  SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+  memset(orig_y, y, kPixels);
+
+  /* YUV converted to ARGB. */
+  I400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight);
+
+  *b = orig_pixels[0];
+  *g = orig_pixels[1];
+  *r = orig_pixels[2];
+}
+
+static void YJToRGB(int y, int* r, int* g, int* b) {
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kPixels = kWidth * kHeight;
+
+  SIMD_ALIGNED(uint8 orig_y[16]);
+  SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+  memset(orig_y, y, kPixels);
+
+  /* YUV converted to ARGB. */
+  J400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight);
+
+  *b = orig_pixels[0];
+  *g = orig_pixels[1];
+  *r = orig_pixels[2];
+}
+
+// Pick a method for clamping.
+//  #define CLAMPMETHOD_IF 1
+//  #define CLAMPMETHOD_TABLE 1
+#define CLAMPMETHOD_TERNARY 1
+//  #define CLAMPMETHOD_MASK 1
+
+// Pick a method for rounding.
+#define ROUND(f) static_cast<int>(f + 0.5f)
+//  #define ROUND(f) lrintf(f)
+//  #define ROUND(f) static_cast<int>(round(f))
+//  #define ROUND(f) _mm_cvt_ss2si(_mm_load_ss(&f))
+
+#if defined(CLAMPMETHOD_IF)
+static int RoundToByte(float f) {
+  int i =  ROUND(f);
+  if (i < 0) {
+    i = 0;
+  }
+  if (i > 255) {
+    i = 255;
+  }
+  return i;
+}
+#elif defined(CLAMPMETHOD_TABLE)
+static const unsigned char clamptable[811] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+  29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+  67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+  104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
+  119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
+  134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
+  149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
+  164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
+  179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
+  194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
+  209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+  239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
+  254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static int RoundToByte(float f) {
+  return clamptable[ROUND(f) + 276];
+}
+#elif defined(CLAMPMETHOD_TERNARY)
+static int RoundToByte(float f) {
+  int i = ROUND(f);
+  return (i < 0) ? 0 : ((i > 255) ? 255 : i);
+}
+#elif defined(CLAMPMETHOD_MASK)
+static int RoundToByte(float f) {
+  int i = ROUND(f);
+  i =  ((-(i) >> 31) & (i));  // clamp to 0.
+  return (((255 - (i)) >> 31) | (i)) & 255;  // clamp to 255.
+}
+#endif
+
+#define RANDOM256(s) ((s & 1) ? ((s >> 1) ^ 0xb8) : (s >> 1))
+
+TEST_F(LibYUVColorTest, TestRoundToByte) {
+  int allb = 0;
+  int count = benchmark_width_ * benchmark_height_;
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    float f = (fastrand() & 255) * 3.14f - 260.f;
+    for (int j = 0; j < count; ++j) {
+      int b = RoundToByte(f);
+      f += 0.91f;
+      allb |= b;
+    }
+  }
+  EXPECT_GE(allb, 0);
+  EXPECT_LE(allb, 255);
+}
+
+static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
+}
+
+static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+  *r = RoundToByte(y - (v - 128) * -1.40200);
+  *g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
+  *b = RoundToByte(y - (u - 128) * -1.77200);
+}
+
+TEST_F(LibYUVColorTest, TestYUV) {
+  int r0, g0, b0, r1, g1, b1;
+
+  // cyan (less red)
+  YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVToRGB(240, 255, 0, &r1, &g1, &b1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // green (less red and blue)
+  YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(2, b0);
+
+  YUVToRGB(240, 0, 0, &r1, &g1, &b1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(5, b1);
+
+  for (int i = 0; i < 256; ++i) {
+    YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
+    YUVToRGB(i, 128, 128, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+
+    YUVToRGBReference(i, 0, 0, &r0, &g0, &b0);
+    YUVToRGB(i, 0, 0, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+
+    YUVToRGBReference(i, 0, 255, &r0, &g0, &b0);
+    YUVToRGB(i, 0, 255, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+  }
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUV) {
+  int r0, g0, b0, r1, g1, b1, r2, g2, b2;
+
+  // black
+  YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
+
+  YUVToRGB(16, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
+
+  // white
+  YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVToRGB(240, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // grey
+  YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(130, r0);
+  EXPECT_EQ(130, g0);
+  EXPECT_EQ(130, b0);
+
+  YUVToRGB(128, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(130, r1);
+  EXPECT_EQ(130, g1);
+  EXPECT_EQ(130, b1);
+
+
+  for (int y = 0; y < 256; ++y) {
+    YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
+    YUVToRGB(y, 128, 128, &r1, &g1, &b1);
+    YToRGB(y, &r2, &g2, &b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
+  }
+}
+
+static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
+  int i;
+  printf("hist");
+  for (i = 0; i < 256; ++i) {
+    if (rh[i] || gh[i] || bh[i]) {
+      printf("\t%8d", i - 128);
+    }
+  }
+  printf("\nred");
+  for (i = 0; i < 256; ++i) {
+    if (rh[i] || gh[i] || bh[i]) {
+      printf("\t%8d", rh[i]);
+    }
+  }
+  printf("\ngreen");
+  for (i = 0; i < 256; ++i) {
+    if (rh[i] || gh[i] || bh[i]) {
+      printf("\t%8d", gh[i]);
+    }
+  }
+  printf("\nblue");
+  for (i = 0; i < 256; ++i) {
+    if (rh[i] || gh[i] || bh[i]) {
+      printf("\t%8d", bh[i]);
+    }
+  }
+  printf("\n");
+}
+
+TEST_F(LibYUVColorTest, TestFullYUV) {
+  int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
+      for (int y2 = 0; y2 < 256; ++y2) {
+        int r0, g0, b0, r1, g1, b1;
+        int y = RANDOM256(y2);
+        YUVToRGBReference(y, u, v, &r0, &g0, &b0);
+        YUVToRGB(y, u, v, &r1, &g1, &b1);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
+        ++rh[r1 - r0 + 128];
+        ++gh[g1 - g0 + 128];
+        ++bh[b1 - b0 + 128];
+      }
+    }
+  }
+  PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestFullYUVJ) {
+  int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
+      for (int y2 = 0; y2 < 256; ++y2) {
+        int r0, g0, b0, r1, g1, b1;
+        int y = RANDOM256(y2);
+        YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
+        YUVJToRGB(y, u, v, &r1, &g1, &b1);
+        EXPECT_NEAR(r0, r1, 1);
+        EXPECT_NEAR(g0, g1, 1);
+        EXPECT_NEAR(b0, b1, 1);
+        ++rh[r1 - r0 + 128];
+        ++gh[g1 - g0 + 128];
+        ++bh[b1 - b0 + 128];
+      }
+    }
+  }
+  PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUVJ) {
+  int r0, g0, b0, r1, g1, b1, r2, g2, b2;
+
+  // black
+  YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
+
+  YUVJToRGB(0, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
+
+  // white
+  YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVJToRGB(255, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // grey
+  YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(128, r0);
+  EXPECT_EQ(128, g0);
+  EXPECT_EQ(128, b0);
+
+  YUVJToRGB(128, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(128, r1);
+  EXPECT_EQ(128, g1);
+  EXPECT_EQ(128, b1);
+
+  for (int y = 0; y < 256; ++y) {
+    YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0);
+    YUVJToRGB(y, 128, 128, &r1, &g1, &b1);
+    YJToRGB(y, &r2, &g2, &b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
+  }
+}
+
+}  // namespace libyuv
diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc
index 8a49a612..a8ce671d 100644
--- a/files/unit_test/compare_test.cc
+++ b/files/unit_test/compare_test.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -16,6 +16,7 @@
 #include "libyuv/basic_types.h"
 #include "libyuv/compare.h"
 #include "libyuv/cpu_id.h"
+#include "libyuv/video_common.h"
 
 namespace libyuv {
 
@@ -30,50 +31,93 @@ static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
   return hash;
 }
 
-TEST_F(libyuvTest, TestDjb2) {
-  const int kMaxTest = 2049;
-  align_buffer_16(src_a, kMaxTest)
+TEST_F(LibYUVBaseTest, Djb2_Test) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_a, kMaxTest);
+  align_buffer_page_end(src_b, kMaxTest);
 
-  for (int i = 0; i < kMaxTest; ++i) {
-    src_a[i] = i;
-  }
-  for (int i = 0; i < kMaxTest; ++i) {
-    uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
-    uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
-    EXPECT_EQ(h1, h2);
-  }
-  // Hash constant generator using for tables in compare
-  int h = 1;
-  for (int i = 0; i <= 16 ; ++i) {
-    printf("%08x ", h);
-    h *= 33;
-  }
-  printf("\n");
-
-  free_aligned_buffer_16(src_a)
-}
-
-TEST_F(libyuvTest, BenchmakDjb2_C) {
-  const int kMaxTest = 1280 * 720;
-  align_buffer_16(src_a, kMaxTest)
+  const char* fox = "The quick brown fox jumps over the lazy dog"
+      " and feels as if he were in the seventh heaven of typography"
+      " together with Hermann Zapf";
+  uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
+  const uint32 kExpectedFoxHash = 2611006483u;
+  EXPECT_EQ(kExpectedFoxHash, foxhash);
 
   for (int i = 0; i < kMaxTest; ++i) {
-    src_a[i] = i;
-  }
-  uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
-  uint32 h1;
-  MaskCpuFlags(kCpuInitialized);
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    h1 = HashDjb2(src_a, kMaxTest, 5381);
+    src_a[i] = (fastrand() & 0xff);
+    src_b[i] = (fastrand() & 0xff);
   }
-  MaskCpuFlags(-1);
+  // Compare different buffers. Expect hash is different.
+  uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+  uint32 h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make last half same. Expect hash is different.
+  memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make first half same. Expect hash is different.
+  memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);
+  memcpy(src_b + kMaxTest / 2, src_b, kMaxTest / 2);
+  memcpy(src_a, src_b, kMaxTest / 2);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make same. Expect hash is same.
+  memcpy(src_a, src_b, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_EQ(h1, h2);
+
+  // Mask seed different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 1234);
+  EXPECT_NE(h1, h2);
+
+  // Make one byte different in middle. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[kMaxTest / 2];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make first byte different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[0];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make last byte different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[kMaxTest - 1];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make a zeros. Test different lengths. Expect hash is different.
+  memset(src_a, 0, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make a zeros and seed of zero. Test different lengths. Expect hash is same.
+  memset(src_a, 0, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 0);
+  h2 = HashDjb2(src_a, kMaxTest / 2, 0);
   EXPECT_EQ(h1, h2);
-  free_aligned_buffer_16(src_a)
+
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, BenchmakDjb2_OPT) {
-  const int kMaxTest = 1280 * 720;
-  align_buffer_16(src_a, kMaxTest)
+TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_a, kMaxTest);
 
   for (int i = 0; i < kMaxTest; ++i) {
     src_a[i] = i;
@@ -84,13 +128,12 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) {
     h1 = HashDjb2(src_a, kMaxTest, 5381);
   }
   EXPECT_EQ(h1, h2);
-  free_aligned_buffer_16(src_a)
+  free_aligned_buffer_page_end(src_a);
 }
 
-TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
-  const int kMaxTest = 1280 * 720;
-  align_buffer_16(src_a, kMaxTest + 1)
-
+TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_a, kMaxTest + 1);
   for (int i = 0; i < kMaxTest; ++i) {
     src_a[i + 1] = i;
   }
@@ -100,64 +143,106 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
     h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
   }
   EXPECT_EQ(h1, h2);
-  free_aligned_buffer_16(src_a)
+  free_aligned_buffer_page_end(src_a);
 }
 
-TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
-  const int kMaxWidth = 4096 * 3;
-  align_buffer_16(src_a, kMaxWidth)
-  align_buffer_16(src_b, kMaxWidth)
-
-  for (int i = 0; i < kMaxWidth; ++i) {
-    src_a[i] = i;
-    src_b[i] = i;
+TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
+  uint32 fourcc;
+  const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_page_end(src_a, kMaxTest);
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i] = 255;
   }
 
-  MaskCpuFlags(kCpuInitialized);
+  src_a[0] = 0;
+  fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+  src_a[0] = 255;
+  src_a[3] = 0;
+  fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+  src_a[3] = 255;
+
   for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+    fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                        benchmark_width_, benchmark_height_);
   }
+  EXPECT_EQ(0, fourcc);
 
-  MaskCpuFlags(-1);
+  free_aligned_buffer_page_end(src_a);
+}
 
-  EXPECT_EQ(0, 0);
+TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
+  uint32 fourcc;
+  const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
+  align_buffer_page_end(src_a, kMaxTest);
+  for (int i = 1; i < kMaxTest; ++i) {
+    src_a[i] = 255;
+  }
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
-}
+  src_a[0 + 1] = 0;
+  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+  src_a[0 + 1] = 255;
+  src_a[3 + 1] = 0;
+  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+  src_a[3 + 1] = 255;
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                        benchmark_width_, benchmark_height_);
+  }
+  EXPECT_EQ(0, fourcc);
 
-TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
+  free_aligned_buffer_page_end(src_a);
+}
+TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
   const int kMaxWidth = 4096 * 3;
-  align_buffer_16(src_a, kMaxWidth)
-  align_buffer_16(src_b, kMaxWidth)
+  align_buffer_page_end(src_a, kMaxWidth);
+  align_buffer_page_end(src_b, kMaxWidth);
+  memset(src_a, 0, kMaxWidth);
+  memset(src_b, 0, kMaxWidth);
+
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+  EXPECT_EQ(790u, h1);
 
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
+  memset(src_a, 0, kMaxWidth);
+  memset(src_b, 0, kMaxWidth);
 
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+  int count = benchmark_iterations_ *
+    ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+  for (int i = 0; i < count; ++i) {
+    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(0, h1);
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, SumSquareError) {
+TEST_F(LibYUVBaseTest, SumSquareError) {
   const int kMaxWidth = 4096 * 3;
-  align_buffer_16(src_a, kMaxWidth)
-  align_buffer_16(src_b, kMaxWidth)
-
+  align_buffer_page_end(src_a, kMaxWidth);
+  align_buffer_page_end(src_b, kMaxWidth);
   memset(src_a, 0, kMaxWidth);
   memset(src_b, 0, kMaxWidth);
 
   uint64 err;
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  EXPECT_EQ(err, 0);
+  EXPECT_EQ(0, err);
 
   memset(src_a, 1, kMaxWidth);
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
@@ -168,90 +253,83 @@ TEST_F(libyuvTest, SumSquareError) {
   memset(src_b, 193, kMaxWidth);
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  EXPECT_EQ(err, (kMaxWidth * 3 * 3));
-
-  srandom(time(NULL));
+  EXPECT_EQ(kMaxWidth * 3 * 3, err);
 
   for (int i = 0; i < kMaxWidth; ++i) {
-    src_a[i] = (random() & 0xff);
-    src_b[i] = (random() & 0xff);
+    src_a[i] = (fastrand() & 0xff);
+    src_b[i] = (fastrand() & 0xff);
   }
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(disable_cpu_flags_);
   uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(benchmark_cpu_info_);
   uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
   EXPECT_EQ(c_err, opt_err);
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, BenchmarkPsnr_C) {
-  align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
-  align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
-
+TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
+  align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(benchmark_cpu_info_);
 
-  double c_time = get_time();
+  double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i)
     CalcFramePsnr(src_a, benchmark_width_,
                   src_b, benchmark_width_,
                   benchmark_width_, benchmark_height_);
 
-  c_time = (get_time() - c_time) / benchmark_iterations_;
-  printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6);
-
-  MaskCpuFlags(-1);
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
 
   EXPECT_EQ(0, 0);
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
-  align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
-  align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
-
+TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
+  align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1);
+  align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    src_a[i] = i;
+    src_a[i + 1] = i;
     src_b[i] = i;
   }
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(benchmark_cpu_info_);
 
   double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i)
-    CalcFramePsnr(src_a, benchmark_width_,
+    CalcFramePsnr(src_a + 1, benchmark_width_,
                   src_b, benchmark_width_,
                   benchmark_width_, benchmark_height_);
 
   opt_time = (get_time() - opt_time) / benchmark_iterations_;
-  printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
 
   EXPECT_EQ(0, 0);
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, Psnr) {
-  const int kSrcWidth = 1280;
-  const int kSrcHeight = 720;
+TEST_F(LibYUVBaseTest, Psnr) {
+  const int kSrcWidth = benchmark_width_;
+  const int kSrcHeight = benchmark_height_;
   const int b = 128;
   const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
   const int kSrcStride = 2 * b + kSrcWidth;
-  align_buffer_16(src_a, kSrcPlaneSize)
-  align_buffer_16(src_b, kSrcPlaneSize)
-
+  align_buffer_page_end(src_a, kSrcPlaneSize);
+  align_buffer_page_end(src_b, kSrcPlaneSize);
   memset(src_a, 0, kSrcPlaneSize);
   memset(src_b, 0, kSrcPlaneSize);
 
@@ -279,36 +357,37 @@ TEST_F(libyuvTest, Psnr) {
   EXPECT_GT(err, 48.0);
   EXPECT_LT(err, 49.0);
 
-  for (int i = 0; i < kSrcPlaneSize; ++i)
+  for (int i = 0; i < kSrcPlaneSize; ++i) {
     src_a[i] = i;
+  }
 
   err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
                       src_b + kSrcStride * b + b, kSrcStride,
                       kSrcWidth, kSrcHeight);
 
-  EXPECT_GT(err, 4.0);
-  EXPECT_LT(err, 5.0);
-
-  srandom(time(NULL));
+  EXPECT_GT(err, 2.0);
+  if (kSrcWidth * kSrcHeight >= 256) {
+    EXPECT_LT(err, 6.0);
+  }
 
   memset(src_a, 0, kSrcPlaneSize);
   memset(src_b, 0, kSrcPlaneSize);
 
   for (int i = b; i < (kSrcHeight + b); ++i) {
     for (int j = b; j < (kSrcWidth + b); ++j) {
-      src_a[(i * kSrcStride) + j] = (random() & 0xff);
-      src_b[(i * kSrcStride) + j] = (random() & 0xff);
+      src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
+      src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
     }
   }
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(disable_cpu_flags_);
   double c_err, opt_err;
 
   c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
                         src_b + kSrcStride * b + b, kSrcStride,
                         kSrcWidth, kSrcHeight);
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(benchmark_cpu_info_);
 
   opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
                           src_b + kSrcStride * b + b, kSrcStride,
@@ -316,48 +395,19 @@ TEST_F(libyuvTest, Psnr) {
 
   EXPECT_EQ(opt_err, c_err);
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
-}
-
-TEST_F(libyuvTest, BenchmarkSsim_C) {
-  align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
-  align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
-
-  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    src_a[i] = i;
-    src_b[i] = i;
-  }
-
-  MaskCpuFlags(kCpuInitialized);
-
-  double c_time = get_time();
-  for (int i = 0; i < benchmark_iterations_; ++i)
-    CalcFrameSsim(src_a, benchmark_width_,
-                  src_b, benchmark_width_,
-                  benchmark_width_, benchmark_height_);
-
-  c_time = (get_time() - c_time) / benchmark_iterations_;
-  printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6);
-
-  MaskCpuFlags(-1);
-
-  EXPECT_EQ(0, 0);
-
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, BenchmarkSsim_OPT) {
-  align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
-  align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
-
+TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
+  align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(benchmark_cpu_info_);
 
   double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i)
@@ -366,32 +416,37 @@ TEST_F(libyuvTest, BenchmarkSsim_OPT) {
                   benchmark_width_, benchmark_height_);
 
   opt_time = (get_time() - opt_time) / benchmark_iterations_;
-  printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+  printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
 
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(0, 0);  // Pass if we get this far.
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
-TEST_F(libyuvTest, Ssim) {
-  const int kSrcWidth = 1280;
-  const int kSrcHeight = 720;
+TEST_F(LibYUVBaseTest, Ssim) {
+  const int kSrcWidth = benchmark_width_;
+  const int kSrcHeight = benchmark_height_;
   const int b = 128;
   const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
   const int kSrcStride = 2 * b + kSrcWidth;
-  align_buffer_16(src_a, kSrcPlaneSize)
-  align_buffer_16(src_b, kSrcPlaneSize)
-
+  align_buffer_page_end(src_a, kSrcPlaneSize);
+  align_buffer_page_end(src_b, kSrcPlaneSize);
   memset(src_a, 0, kSrcPlaneSize);
   memset(src_b, 0, kSrcPlaneSize);
 
+  if (kSrcWidth <=8 || kSrcHeight <= 8) {
+    printf("warning - Ssim size too small.  Testing function executes.\n");
+  }
+
   double err;
   err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
                       src_b + kSrcStride * b + b, kSrcStride,
                       kSrcWidth, kSrcHeight);
 
-  EXPECT_EQ(err, 1.0);
+  if (kSrcWidth > 8 && kSrcHeight > 8) {
+    EXPECT_EQ(err, 1.0);
+  }
 
   memset(src_a, 255, kSrcPlaneSize);
 
@@ -399,7 +454,9 @@ TEST_F(libyuvTest, Ssim) {
                       src_b + kSrcStride * b + b, kSrcStride,
                       kSrcWidth, kSrcHeight);
 
-  EXPECT_LT(err, 0.0001);
+  if (kSrcWidth > 8 && kSrcHeight > 8) {
+    EXPECT_LT(err, 0.0001);
+  }
 
   memset(src_a, 1, kSrcPlaneSize);
 
@@ -407,44 +464,50 @@ TEST_F(libyuvTest, Ssim) {
                       src_b + kSrcStride * b + b, kSrcStride,
                       kSrcWidth, kSrcHeight);
 
-  EXPECT_GT(err, 0.8);
-  EXPECT_LT(err, 0.9);
+  if (kSrcWidth > 8 && kSrcHeight > 8) {
+    EXPECT_GT(err, 0.0001);
+    EXPECT_LT(err, 0.9);
+  }
 
-  for (int i = 0; i < kSrcPlaneSize; ++i)
+  for (int i = 0; i < kSrcPlaneSize; ++i) {
     src_a[i] = i;
+  }
 
   err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
                       src_b + kSrcStride * b + b, kSrcStride,
                       kSrcWidth, kSrcHeight);
 
-  EXPECT_GT(err, 0.008);
-  EXPECT_LT(err, 0.009);
+  if (kSrcWidth > 8 && kSrcHeight > 8) {
+    EXPECT_GT(err, 0.0);
+    EXPECT_LT(err, 0.01);
+  }
 
-  srandom(time(NULL));
   for (int i = b; i < (kSrcHeight + b); ++i) {
     for (int j = b; j < (kSrcWidth + b); ++j) {
-      src_a[(i * kSrcStride) + j] = (random() & 0xff);
-      src_b[(i * kSrcStride) + j] = (random() & 0xff);
+      src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
+      src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
     }
   }
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(disable_cpu_flags_);
   double c_err, opt_err;
 
   c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
                         src_b + kSrcStride * b + b, kSrcStride,
                         kSrcWidth, kSrcHeight);
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(benchmark_cpu_info_);
 
   opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
                           src_b + kSrcStride * b + b, kSrcStride,
                           kSrcWidth, kSrcHeight);
 
-  EXPECT_EQ(opt_err, c_err);
+  if (kSrcWidth > 8 && kSrcHeight > 8) {
+    EXPECT_EQ(opt_err, c_err);
+  }
 
-  free_aligned_buffer_16(src_a)
-  free_aligned_buffer_16(src_b)
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
 }
 
 }  // namespace libyuv
diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc
new file mode 100644
index 00000000..56a2bfd8
--- /dev/null
+++ b/files/unit_test/convert_test.cc
@@ -0,0 +1,1861 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,           \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)   \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                 \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u,                                                 \
+                        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                     \
+                        SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);              \
+  align_buffer_page_end(src_v,                                                 \
+                        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                     \
+                        SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);              \
+  align_buffer_page_end(dst_y_c, kWidth * kHeight);                            \
+  align_buffer_page_end(dst_u_c,                                               \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_v_c,                                               \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_y_opt, kWidth * kHeight);                          \
+  align_buffer_page_end(dst_u_opt,                                             \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_v_opt,                                             \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) {                \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) {               \
+      src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+      src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_y_c, 1, kWidth * kHeight);                                        \
+  memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_u + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 src_v + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 dst_y_c, kWidth,                              \
+                                 dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_u + OFF,                                \
+                                       SUBSAMPLE(kWidth, SRC_SUBSAMP_X),       \
+                                   src_v + OFF,                                \
+                                       SUBSAMPLE(kWidth, SRC_SUBSAMP_X),       \
+                                   dst_y_opt, kWidth,                          \
+                                   dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_EQ(0, max_diff);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_u_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_u_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 3);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_v_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_v_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 3);                                                      \
+  free_aligned_buffer_page_end(dst_y_c);                                       \
+  free_aligned_buffer_page_end(dst_u_c);                                       \
+  free_aligned_buffer_page_end(dst_v_c);                                       \
+  free_aligned_buffer_page_end(dst_y_opt);                                     \
+  free_aligned_buffer_page_end(dst_u_opt);                                     \
+  free_aligned_buffer_page_end(dst_v_opt);                                     \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,            \
+                      FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                        \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_ - 4, _Any, +, 0)                           \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Unaligned, +, 1)                         \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Invert, -, 0)                            \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
+TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
+TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
+
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,          \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)   \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                 \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u,                                                 \
+                        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                     \
+                        SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);              \
+  align_buffer_page_end(src_v,                                                 \
+                        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                     \
+                        SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);              \
+  align_buffer_page_end(dst_y_c, kWidth * kHeight);                            \
+  align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) *           \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_y_opt, kWidth * kHeight);                          \
+  align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) *         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) {                \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) {               \
+      src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+      src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_y_c, 1, kWidth * kHeight);                                        \
+  memset(dst_uv_c, 2, SUBSAMPLE(kWidth * 2, SUBSAMP_X) *                       \
+                      SUBSAMPLE(kHeight, SUBSAMP_Y));                          \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) *                   \
+                          SUBSAMPLE(kHeight, SUBSAMP_Y));                      \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_u + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 src_v + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 dst_y_c, kWidth,                              \
+                                 dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X),   \
+                                 kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_u + OFF,                                \
+                                   SUBSAMPLE(kWidth, SRC_SUBSAMP_X),           \
+                                   src_v + OFF,                                \
+                                   SUBSAMPLE(kWidth, SRC_SUBSAMP_X),           \
+                                   dst_y_opt, kWidth,                          \
+                                   dst_uv_opt,                                 \
+                                   SUBSAMPLE(kWidth * 2, SUBSAMP_X),           \
+                                   kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) {               \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_uv_c[i *                                    \
+                               SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) -        \
+              static_cast<int>(dst_uv_opt[i *                                  \
+                               SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]));        \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);                                                      \
+  free_aligned_buffer_page_end(dst_y_c);                                       \
+  free_aligned_buffer_page_end(dst_uv_c);                                      \
+  free_aligned_buffer_page_end(dst_y_opt);                                     \
+  free_aligned_buffer_page_end(dst_uv_opt);                                    \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+}
+
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,           \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                       \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_ - 4, _Any, +, 0)                          \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Unaligned, +, 1)                        \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Invert, -, 0)                           \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
+TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
+                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                 \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *         \
+                        SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);              \
+  align_buffer_page_end(dst_y_c, kWidth * kHeight);                            \
+  align_buffer_page_end(dst_u_c,                                               \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_v_c,                                               \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_y_opt, kWidth * kHeight);                          \
+  align_buffer_page_end(dst_u_opt,                                             \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  align_buffer_page_end(dst_v_opt,                                             \
+                        SUBSAMPLE(kWidth, SUBSAMP_X) *                         \
+                        SUBSAMPLE(kHeight, SUBSAMP_Y));                        \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) {                \
+    for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) {           \
+      src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =           \
+          (fastrand() & 0xff);                                                 \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_y_c, 1, kWidth * kHeight);                                        \
+  memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_uv + OFF,                                 \
+                                 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X),         \
+                                 dst_y_c, kWidth,                              \
+                                 dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_uv + OFF,                               \
+                                   2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X),       \
+                                   dst_y_opt, kWidth,                          \
+                                   dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_u_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_u_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_v_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_v_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);                                                      \
+  free_aligned_buffer_page_end(dst_y_c);                                       \
+  free_aligned_buffer_page_end(dst_u_c);                                       \
+  free_aligned_buffer_page_end(dst_v_c);                                       \
+  free_aligned_buffer_page_end(dst_y_opt);                                     \
+  free_aligned_buffer_page_end(dst_u_opt);                                     \
+  free_aligned_buffer_page_end(dst_v_opt);                                     \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_uv);                                        \
+}
+
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,          \
+                        FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                      \
+    TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                         \
+                     benchmark_width_ - 4, _Any, +, 0)                         \
+    TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                         \
+                     benchmark_width_, _Unaligned, +, 1)                       \
+    TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                         \
+                     benchmark_width_, _Invert, -, 0)                          \
+    TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                         \
+                     benchmark_width_, _Opt, +, 0)
+
+TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C)         \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                          \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_v, kSizeUV + OFF);                                 \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);               \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                             \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                         \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        dst_argb_c + OFF, kStrideB,                            \
+                        kWidth, NEG kHeight);                                  \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          dst_argb_opt + OFF, kStrideB,                        \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  /* Convert to ARGB so 565 is expanded to bytes that can be compared. */      \
+  align_buffer_page_end(dst_argb32_c, kWidth * BPP_C  * kHeight);              \
+  align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C  * kHeight);            \
+  memset(dst_argb32_c, 2, kWidth * BPP_C  * kHeight);                          \
+  memset(dst_argb32_opt, 102, kWidth * BPP_C  * kHeight);                      \
+  FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB,                                 \
+                   dst_argb32_c, kWidth * BPP_C ,                              \
+                   kWidth, kHeight);                                           \
+  FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB,                               \
+                   dst_argb32_opt, kWidth * BPP_C ,                            \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                         \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb32_c[i]) -                                \
+            static_cast<int>(dst_argb32_opt[i]));                              \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+  free_aligned_buffer_page_end(dst_argb32_c);                                  \
+  free_aligned_buffer_page_end(dst_argb32_opt);                                \
+}
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,   \
+                      YALIGN, DIFF, FMT_C, BPP_C)                              \
+    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C)          \
+    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C)        \
+    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+        YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C)           \
+    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+        YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
+
+#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN)                \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                          \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_v, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_a, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);               \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+    src_a[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                             \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                         \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        src_a + OFF, kWidth,                                   \
+                        dst_argb_c + OFF, kStrideB,                            \
+                        kWidth, NEG kHeight, ATTEN);                           \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          src_a + OFF, kWidth,                                 \
+                          dst_argb_opt + OFF, kStrideB,                        \
+                          kWidth, NEG kHeight, ATTEN);                         \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                         \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb_c[i + OFF]) -                            \
+            static_cast<int>(dst_argb_opt[i + OFF]));                          \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+  free_aligned_buffer_page_end(src_a);                                         \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+}
+
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
+                       YALIGN, DIFF)                                           \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0)                     \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0)                   \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0)                      \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)                         \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
+
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,       \
+                         W1280, DIFF, N, NEG, OFF)                             \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                          \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideB = kWidth * BPP_B;                                         \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_uv,                                                \
+                        kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF);  \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                       \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                     \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < kStrideUV * 2; ++j) {                                  \
+      src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);               \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideB * kHeight);                                   \
+  memset(dst_argb_opt, 101, kStrideB * kHeight);                               \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                   \
+                        src_uv + OFF, kStrideUV * 2,                           \
+                        dst_argb_c, kWidth * BPP_B,                            \
+                        kWidth, NEG kHeight);                                  \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_uv + OFF, kStrideUV * 2,                         \
+                          dst_argb_opt, kWidth * BPP_B,                        \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  /* Convert to ARGB so 565 is expanded to bytes that can be compared. */      \
+  align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                   \
+  align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);                 \
+  memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                               \
+  memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                           \
+  FMT_B##ToARGB(dst_argb_c, kStrideB,                                          \
+                dst_argb32_c, kWidth * 4,                                      \
+                kWidth, kHeight);                                              \
+  FMT_B##ToARGB(dst_argb_opt, kStrideB,                                        \
+                dst_argb32_opt, kWidth * 4,                                    \
+                kWidth, kHeight);                                              \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth * 4; ++j) {                                     \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) -             \
+              static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j]));           \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_uv);                                        \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+  free_aligned_buffer_page_end(dst_argb32_c);                                  \
+  free_aligned_buffer_page_end(dst_argb32_opt);                                \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF)  \
+    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,           \
+                     benchmark_width_ - 4, DIFF, _Any, +, 0)                   \
+    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,           \
+                     benchmark_width_, DIFF, _Unaligned, +, 1)                 \
+    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,           \
+                     benchmark_width_, DIFF, _Invert, -, 0)                    \
+    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,           \
+                     benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+                       W1280, DIFF, N, NEG, OFF)                               \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) {                          \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kStride =                                                          \
+      (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;                             \
+  align_buffer_page_end(src_argb, kStride * kHeight + OFF);                    \
+  align_buffer_page_end(dst_y_c, kWidth * kHeight);                            \
+  align_buffer_page_end(dst_u_c,                                               \
+                        kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));            \
+  align_buffer_page_end(dst_v_c,                                               \
+                        kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));            \
+  align_buffer_page_end(dst_y_opt, kWidth * kHeight);                          \
+  align_buffer_page_end(dst_u_opt,                                             \
+                        kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));            \
+  align_buffer_page_end(dst_v_opt,                                             \
+                  kStrideUV *                                                  \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  memset(dst_y_c, 1, kWidth * kHeight);                                        \
+  memset(dst_u_c, 2,                                                           \
+         kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_v_c, 3,                                                           \
+         kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_u_opt, 102,                                                       \
+         kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_v_opt, 103,                                                       \
+         kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kStride; ++j)                                          \
+      src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff);                 \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                               \
+                        dst_y_c, kWidth,                                       \
+                        dst_u_c, kStrideUV,                                    \
+                        dst_v_c, kStrideUV,                                    \
+                        kWidth, NEG kHeight);                                  \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                             \
+                          dst_y_opt, kWidth,                                   \
+                          dst_u_opt, kStrideUV,                                \
+                          dst_v_opt, kStrideUV,                                \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]),                   \
+                  static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF);          \
+    }                                                                          \
+  }                                                                            \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < kStrideUV; ++j) {                                      \
+      EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]),                \
+                  static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF);       \
+    }                                                                          \
+  }                                                                            \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < kStrideUV; ++j) {                                      \
+      EXPECT_NEAR(static_cast<int>(dst_v_c[i *                                 \
+                                   kStrideUV + j]),                            \
+                  static_cast<int>(dst_v_opt[i *                               \
+                                   kStrideUV + j]), DIFF);                     \
+    }                                                                          \
+  }                                                                            \
+  free_aligned_buffer_page_end(dst_y_c);                                       \
+  free_aligned_buffer_page_end(dst_u_c);                                       \
+  free_aligned_buffer_page_end(dst_v_c);                                       \
+  free_aligned_buffer_page_end(dst_y_opt);                                     \
+  free_aligned_buffer_page_end(dst_u_opt);                                     \
+  free_aligned_buffer_page_end(dst_v_opt);                                     \
+  free_aligned_buffer_page_end(src_argb);                                      \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,  \
+                      DIFF)                                                    \
+    TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,     \
+                   benchmark_width_ - 4, DIFF, _Any, +, 0)                     \
+    TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,     \
+                   benchmark_width_, DIFF, _Unaligned, +, 1)                   \
+    TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,     \
+                   benchmark_width_, DIFF, _Invert, -, 0)                      \
+    TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,     \
+                   benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
+#if defined(__arm__) || defined (__aarch64__)
+// arm version subsamples by summing 4 pixels then multiplying by matrix with
+// 4x smaller coefficients which are rounded to nearest integer.
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 4)
+#else
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 0)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
+// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+
+#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR,                      \
+                         SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)             \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) {                          \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  align_buffer_page_end(src_argb, kStride * kHeight + OFF);                    \
+  align_buffer_page_end(dst_y_c, kWidth * kHeight);                            \
+  align_buffer_page_end(dst_uv_c,                                              \
+                        kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
+  align_buffer_page_end(dst_y_opt, kWidth * kHeight);                          \
+  align_buffer_page_end(dst_uv_opt,                                            \
+                        kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kStride; ++j)                                          \
+      src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff);                 \
+  memset(dst_y_c, 1, kWidth * kHeight);                                        \
+  memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));          \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                               \
+                        dst_y_c, kWidth, dst_uv_c, kStrideUV * 2,              \
+                        kWidth, NEG kHeight);                                  \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                             \
+                          dst_y_opt, kWidth,                                   \
+                          dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight);     \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 4);                                                      \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < kStrideUV * 2; ++j) {                                  \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) -              \
+              static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 4);                                                      \
+  free_aligned_buffer_page_end(dst_y_c);                                       \
+  free_aligned_buffer_page_end(dst_uv_c);                                      \
+  free_aligned_buffer_page_end(dst_y_opt);                                     \
+  free_aligned_buffer_page_end(dst_uv_opt);                                    \
+  free_aligned_buffer_page_end(src_argb);                                      \
+}
+
+#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+    TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,    \
+                     benchmark_width_ - 4, _Any, +, 0)                         \
+    TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,    \
+                     benchmark_width_, _Unaligned, +, 1)                       \
+    TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,    \
+                     benchmark_width_, _Invert, -, 0)                          \
+    TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,    \
+                     benchmark_width_, _Opt, +, 0)
+
+TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                            \
+                  W1280, DIFF, N, NEG, OFF)                                    \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                               \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);                  \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                      \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                    \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    src_argb[i + OFF] = (fastrand() & 0xff);                                   \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideB * kHeightB);                                  \
+  memset(dst_argb_opt, 101, kStrideB * kHeightB);                              \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_B(src_argb + OFF, kStrideA,                                   \
+                   dst_argb_c, kStrideB,                                       \
+                   kWidth, NEG kHeight);                                       \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_B(src_argb + OFF, kStrideA,                                 \
+                     dst_argb_opt, kStrideB,                                   \
+                     kWidth, NEG kHeight);                                     \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kStrideB * kHeightB; ++i) {                              \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb_c[i]) -                                  \
+            static_cast<int>(dst_argb_opt[i]));                                \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_argb);                                      \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+}
+
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                       \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                         \
+  for (int times = 0; times < benchmark_iterations_; ++times) {                \
+    const int kWidth = (fastrand() & 63) + 1;                                  \
+    const int kHeight = (fastrand() & 31) + 1;                                 \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
+    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+    align_buffer_page_end(src_argb, kStrideA * kHeightA);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                  \
+    for (int i = 0; i < kStrideA * kHeightA; ++i) {                            \
+      src_argb[i] = (fastrand() & 0xff);                                       \
+    }                                                                          \
+    memset(dst_argb_c, 123, kStrideB * kHeightB);                              \
+    memset(dst_argb_opt, 123, kStrideB * kHeightB);                            \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_A##To##FMT_B(src_argb, kStrideA,                                       \
+                     dst_argb_c, kStrideB,                                     \
+                     kWidth, kHeight);                                         \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    FMT_A##To##FMT_B(src_argb, kStrideA,                                       \
+                     dst_argb_opt, kStrideB,                                   \
+                     kWidth, kHeight);                                         \
+    int max_diff = 0;                                                          \
+    for (int i = 0; i < kStrideB * kHeightB; ++i) {                            \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_argb_c[i]) -                                \
+              static_cast<int>(dst_argb_opt[i]));                              \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+    EXPECT_LE(max_diff, DIFF);                                                 \
+    free_aligned_buffer_page_end(src_argb);                                    \
+    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_opt);                                \
+  }                                                                            \
+}
+
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                             \
+                 FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                       \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                                \
+              FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                                \
+              benchmark_width_ - 4, DIFF, _Any, +, 0)                          \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                                \
+              FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                                \
+              benchmark_width_, DIFF, _Unaligned, +, 1)                        \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                                \
+              FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                                \
+              benchmark_width_, DIFF, _Invert, -, 0)                           \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                                \
+              FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                                \
+              benchmark_width_, DIFF, _Opt, +, 0)                              \
+    TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                           \
+                   FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                           \
+                   FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                           \
+                   W1280, DIFF, N, NEG, OFF)                                   \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) {                       \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);                  \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                      \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                    \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    src_argb[i + OFF] = (fastrand() & 0xff);                                   \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideB * kHeightB);                                  \
+  memset(dst_argb_opt, 101, kStrideB * kHeightB);                              \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA,                           \
+                           dst_argb_c, kStrideB,                               \
+                           NULL, kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA,                         \
+                             dst_argb_opt, kStrideB,                           \
+                             NULL, kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kStrideB * kHeightB; ++i) {                              \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb_c[i]) -                                  \
+            static_cast<int>(dst_argb_opt[i]));                                \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_argb);                                      \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+}
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                      \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) {                   \
+  for (int times = 0; times < benchmark_iterations_; ++times) {                \
+    const int kWidth = (fastrand() & 63) + 1;                                  \
+    const int kHeight = (fastrand() & 31) + 1;                                 \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
+    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+    align_buffer_page_end(src_argb, kStrideA * kHeightA);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                  \
+    for (int i = 0; i < kStrideA * kHeightA; ++i) {                            \
+      src_argb[i] = (fastrand() & 0xff);                                       \
+    }                                                                          \
+    memset(dst_argb_c, 123, kStrideB * kHeightB);                              \
+    memset(dst_argb_opt, 123, kStrideB * kHeightB);                            \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_A##To##FMT_B##Dither(src_argb, kStrideA,                               \
+                             dst_argb_c, kStrideB,                             \
+                             NULL, kWidth, kHeight);                           \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    FMT_A##To##FMT_B##Dither(src_argb, kStrideA,                               \
+                             dst_argb_opt, kStrideB,                           \
+                             NULL, kWidth, kHeight);                           \
+    int max_diff = 0;                                                          \
+    for (int i = 0; i < kStrideB * kHeightB; ++i) {                            \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_argb_c[i]) -                                \
+              static_cast<int>(dst_argb_opt[i]));                              \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+    EXPECT_LE(max_diff, DIFF);                                                 \
+    free_aligned_buffer_page_end(src_argb);                                    \
+    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_opt);                                \
+  }                                                                            \
+}
+
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                      \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_ - 4, DIFF, _Any, +, 0)                         \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Unaligned, +, 1)                       \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Invert, -, 0)                          \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Opt, +, 0)                             \
+    TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                          \
+                    FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                          \
+                 W1280, N, NEG, OFF)                                           \
+TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) {                            \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);                  \
+  align_buffer_page_end(dst_argb_c, kStrideA * kHeightA);                      \
+  align_buffer_page_end(dst_argb_opt, kStrideA * kHeightA);                    \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    src_argb[i + OFF] = (fastrand() & 0xff);                                   \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideA * kHeightA);                                  \
+  memset(dst_argb_opt, 101, kStrideA * kHeightA);                              \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_ATOB(src_argb + OFF, kStrideA,                                           \
+           dst_argb_c, kStrideA,                                               \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_ATOB(src_argb + OFF, kStrideA,                                         \
+             dst_argb_opt, kStrideA,                                           \
+             kWidth, NEG kHeight);                                             \
+  }                                                                            \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_ATOB(dst_argb_c, kStrideA,                                               \
+           dst_argb_c, kStrideA,                                               \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  FMT_ATOB(dst_argb_opt, kStrideA,                                             \
+           dst_argb_opt, kStrideA,                                             \
+           kWidth, NEG kHeight);                                               \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                             \
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                                 \
+  }                                                                            \
+  free_aligned_buffer_page_end(src_argb);                                      \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+}
+
+#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A)                           \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_ - 4, _Any, +, 0)                                 \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_, _Unaligned, +, 1)                               \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_, _Opt, +, 0)
+
+TESTSYM(ARGBToARGB, 4, 4, 1)
+TESTSYM(ARGBToBGRA, 4, 4, 1)
+TESTSYM(ARGBToABGR, 4, 4, 1)
+TESTSYM(BGRAToARGB, 4, 4, 1)
+TESTSYM(ABGRToARGB, 4, 4, 1)
+
+TEST_F(LibYUVConvertTest, Test565) {
+  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+  SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+  for (int i = 0; i < 256; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      orig_pixels[i][j] = i;
+    }
+  }
+  ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+  uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+  EXPECT_EQ(610919429u, checksum);
+}
+
+#ifdef HAVE_JPEG
+TEST_F(LibYUVConvertTest, ValidateJpeg) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+  // No SOI or EOI. Expect fail.
+  memset(orig_pixels, 0, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // Test special value that matches marker start.
+  memset(orig_pixels, 0xff, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // EOI, SOI. Expect pass.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+  }
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  const int kMultiple = 10;
+  const int kBufSize = kImageSize * kMultiple + kOff;
+  align_buffer_page_end(orig_pixels, kBufSize);
+
+  // No SOI or EOI. Expect fail.
+  memset(orig_pixels, 0, kBufSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+
+  // EOI, SOI. Expect pass.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+  }
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, InvalidateJpeg) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+  // NULL pointer. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(NULL, kSize));
+
+  // Negative size. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
+
+  // Too large size. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+
+  // No SOI or EOI. Expect fail.
+  memset(orig_pixels, 0, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // SOI but no EOI. Expect fail.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  }
+
+  // EOI but no SOI. Expect fail.
+  orig_pixels[0] = 0;
+  orig_pixels[1] = 0;
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, FuzzJpeg) {
+  // SOI but no EOI. Expect fail.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    const int kSize = fastrand() % 5000 + 2;
+    align_buffer_page_end(orig_pixels, kSize);
+    MemRandomize(orig_pixels, kSize);
+
+    // Add SOI so frame will be scanned.
+    orig_pixels[0] = 0xff;
+    orig_pixels[1] = 0xd8;  // SOI.
+    orig_pixels[kSize - 1] = 0xff;
+    ValidateJpeg(orig_pixels, kSize);  // Failure normally expected.
+    free_aligned_buffer_page_end(orig_pixels);
+  }
+}
+
+TEST_F(LibYUVConvertTest, MJPGToI420) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_u_opt,
+                        SUBSAMPLE(benchmark_width_, 2) *
+                        SUBSAMPLE(benchmark_height_, 2));
+  align_buffer_page_end(dst_v_opt,
+                        SUBSAMPLE(benchmark_width_, 2) *
+                        SUBSAMPLE(benchmark_height_, 2));
+
+  // EOI, SOI to make MJPG appear valid.
+  memset(orig_pixels, 0, kSize);
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    int ret = MJPGToI420(orig_pixels, kSize,
+                         dst_y_opt, benchmark_width_,
+                         dst_u_opt, SUBSAMPLE(benchmark_width_, 2),
+                         dst_v_opt, SUBSAMPLE(benchmark_width_, 2),
+                         benchmark_width_, benchmark_height_,
+                         benchmark_width_, benchmark_height_);
+    // Expect failure because image is not really valid.
+    EXPECT_EQ(1, ret);
+  }
+
+  free_aligned_buffer_page_end(dst_y_opt);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, MJPGToARGB) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+
+  // EOI, SOI to make MJPG appear valid.
+  memset(orig_pixels, 0, kSize);
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    int ret = MJPGToARGB(orig_pixels, kSize,
+                         dst_argb_opt, benchmark_width_ * 4,
+                         benchmark_width_, benchmark_height_,
+                         benchmark_width_, benchmark_height_);
+    // Expect failure because image is not really valid.
+    EXPECT_EQ(1, ret);
+  }
+
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+#endif  // HAVE_JPEG
+
+TEST_F(LibYUVConvertTest, NV12Crop) {
+  const int SUBSAMP_X = 2;
+  const int SUBSAMP_Y = 2;
+  const int kWidth = benchmark_width_;
+  const int kHeight = benchmark_height_;
+  const int crop_y =
+    ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+  const int kDestWidth = benchmark_width_;
+  const int kDestHeight = benchmark_height_ - crop_y * 2;
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
+  const int sample_size = kWidth * kHeight +
+    kStrideUV *
+    SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+  align_buffer_page_end(src_y, sample_size);
+  uint8* src_uv = src_y + kWidth * kHeight;
+
+  align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+  align_buffer_page_end(dst_u,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_v,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+  align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
+  align_buffer_page_end(dst_u_2,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_v_2,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+  for (int i = 0; i < kHeight * kWidth; ++i) {
+    src_y[i] = (fastrand() & 0xff);
+  }
+  for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) *
+       kStrideUV) * 2; ++i) {
+    src_uv[i] = (fastrand() & 0xff);
+  }
+  memset(dst_y, 1, kDestWidth * kDestHeight);
+  memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_y_2, 1, kDestWidth * kDestHeight);
+  memset(dst_u_2, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+  ConvertToI420(src_y, sample_size,
+                dst_y_2, kDestWidth,
+                dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+                dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+                0, crop_y,
+                kWidth, kHeight,
+                kDestWidth, kDestHeight,
+                libyuv::kRotate0, libyuv::FOURCC_NV12);
+
+  NV12ToI420(src_y + crop_y * kWidth, kWidth,
+             src_uv + (crop_y / 2) * kStrideUV * 2,
+               kStrideUV * 2,
+             dst_y, kDestWidth,
+             dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+             dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+             kDestWidth, kDestHeight);
+
+  for (int i = 0; i < kDestHeight; ++i) {
+    for (int j = 0; j < kDestWidth; ++j) {
+      EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
+    }
+  }
+  for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+    for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+      EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+                dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+    }
+  }
+  for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+    for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+      EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+                dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+    }
+  }
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_u);
+  free_aligned_buffer_page_end(dst_v);
+  free_aligned_buffer_page_end(dst_y_2);
+  free_aligned_buffer_page_end(dst_u_2);
+  free_aligned_buffer_page_end(dst_v_2);
+  free_aligned_buffer_page_end(src_y);
+}
+
+TEST_F(LibYUVConvertTest, TestYToARGB) {
+  uint8 y[32];
+  uint8 expectedg[32];
+  for (int i = 0; i < 32; ++i) {
+    y[i] = i * 5 + 17;
+    expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
+  }
+  uint8 argb[32 * 4];
+  YToARGB(y, 0, argb, 0, 32, 1);
+
+  for (int i = 0; i < 32; ++i) {
+    printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
+           argb[i * 4 + 0],
+           argb[i * 4 + 1],
+           argb[i * 4 + 2],
+           argb[i * 4 + 3]);
+  }
+  for (int i = 0; i < 32; ++i) {
+    EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
+  }
+}
+
+static const uint8 kNoDither4x4[16] = {
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+};
+
+TEST_F(LibYUVConvertTest, TestNoDither) {
+  align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_rgb565dither,
+                        benchmark_width_ * benchmark_height_ * 2);
+  MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+  MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+  MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+  ARGBToRGB565(src_argb, benchmark_width_ * 4,
+               dst_rgb565, benchmark_width_ * 2,
+               benchmark_width_, benchmark_height_);
+  ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
+                     dst_rgb565dither, benchmark_width_ * 2,
+                     kNoDither4x4, benchmark_width_, benchmark_height_);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+    EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+  }
+
+  free_aligned_buffer_page_end(src_argb);
+  free_aligned_buffer_page_end(dst_rgb565);
+  free_aligned_buffer_page_end(dst_rgb565dither);
+}
+
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+TEST_F(LibYUVConvertTest, TestDither) {
+  align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_rgb565dither,
+                        benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_argbdither,
+                        benchmark_width_ * benchmark_height_ * 4);
+  MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+  MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+  MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+  MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+  MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
+  ARGBToRGB565(src_argb, benchmark_width_ * 4,
+               dst_rgb565, benchmark_width_ * 2,
+               benchmark_width_, benchmark_height_);
+  ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
+                     dst_rgb565dither, benchmark_width_ * 2,
+                     kDither565_4x4, benchmark_width_, benchmark_height_);
+  RGB565ToARGB(dst_rgb565, benchmark_width_ * 2,
+               dst_argb, benchmark_width_ * 4,
+               benchmark_width_, benchmark_height_);
+  RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2,
+               dst_argbdither, benchmark_width_ * 4,
+               benchmark_width_, benchmark_height_);
+
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+    EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+  }
+  free_aligned_buffer_page_end(src_argb);
+  free_aligned_buffer_page_end(dst_rgb565);
+  free_aligned_buffer_page_end(dst_rgb565dither);
+  free_aligned_buffer_page_end(dst_argb);
+  free_aligned_buffer_page_end(dst_argbdither);
+}
+
+#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C)         \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) {                  \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_v, kSizeUV + OFF);                                 \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);               \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                             \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                         \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth,                           \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        dst_argb_c + OFF, kStrideB,                            \
+                        NULL, kWidth, NEG kHeight);                            \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth,                         \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          dst_argb_opt + OFF, kStrideB,                        \
+                          NULL, kWidth, NEG kHeight);                          \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  /* Convert to ARGB so 565 is expanded to bytes that can be compared. */      \
+  align_buffer_page_end(dst_argb32_c, kWidth * BPP_C  * kHeight);              \
+  align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C  * kHeight);            \
+  memset(dst_argb32_c, 2, kWidth * BPP_C  * kHeight);                          \
+  memset(dst_argb32_opt, 102, kWidth * BPP_C  * kHeight);                      \
+  FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB,                                 \
+                   dst_argb32_c, kWidth * BPP_C ,                              \
+                   kWidth, kHeight);                                           \
+  FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB,                               \
+                   dst_argb32_opt, kWidth * BPP_C ,                            \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                         \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb32_c[i]) -                                \
+            static_cast<int>(dst_argb32_opt[i]));                              \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+  free_aligned_buffer_page_end(dst_argb32_c);                                  \
+  free_aligned_buffer_page_end(dst_argb32_opt);                                \
+}
+
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
+                      YALIGN, DIFF, FMT_C, BPP_C)                              \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C)          \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C)        \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C)           \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12)                                 \
+TEST_F(LibYUVConvertTest, NAME) {                                              \
+  const int kWidth = benchmark_width_;                                         \
+  const int kHeight = benchmark_height_;                                       \
+                                                                               \
+  align_buffer_page_end(orig_uyvy,                                             \
+                  4 * SUBSAMPLE(kWidth, 2) * kHeight);                         \
+  align_buffer_page_end(orig_y, kWidth * kHeight);                             \
+  align_buffer_page_end(orig_u,                                                \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+  align_buffer_page_end(orig_v,                                                \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+                                                                               \
+  align_buffer_page_end(dst_y_orig, kWidth * kHeight);                         \
+  align_buffer_page_end(dst_uv_orig, 2 *                                       \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+                                                                               \
+  align_buffer_page_end(dst_y, kWidth * kHeight);                              \
+  align_buffer_page_end(dst_uv, 2 *                                            \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+                                                                               \
+  MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight);                 \
+                                                                               \
+  /* Convert UYVY to NV12 in 2 steps for reference */                          \
+  libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2),                      \
+                     orig_y, kWidth,                                           \
+                     orig_u, SUBSAMPLE(kWidth, 2),                             \
+                     orig_v, SUBSAMPLE(kWidth, 2),                             \
+                     kWidth, kHeight);                                         \
+  libyuv::I420ToNV12(orig_y, kWidth,                                           \
+                     orig_u, SUBSAMPLE(kWidth, 2),                             \
+                     orig_v, SUBSAMPLE(kWidth, 2),                             \
+                     dst_y_orig, kWidth,                                       \
+                     dst_uv_orig, 2 * SUBSAMPLE(kWidth, 2),                    \
+                     kWidth, kHeight);                                         \
+                                                                               \
+  /* Convert to NV12 */                                                        \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2),                    \
+                       dst_y, kWidth,                                          \
+                       dst_uv, 2 * SUBSAMPLE(kWidth, 2),                       \
+                       kWidth, kHeight);                                       \
+  }                                                                            \
+                                                                               \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    EXPECT_EQ(orig_y[i], dst_y[i]);                                            \
+  }                                                                            \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    EXPECT_EQ(dst_y_orig[i], dst_y[i]);                                        \
+  }                                                                            \
+  for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); ++i) { \
+    EXPECT_EQ(dst_uv_orig[i], dst_uv[i]);                                      \
+  }                                                                            \
+                                                                               \
+  free_aligned_buffer_page_end(orig_uyvy);                                     \
+  free_aligned_buffer_page_end(orig_y);                                        \
+  free_aligned_buffer_page_end(orig_u);                                        \
+  free_aligned_buffer_page_end(orig_v);                                        \
+  free_aligned_buffer_page_end(dst_y_orig);                                    \
+  free_aligned_buffer_page_end(dst_uv_orig);                                   \
+  free_aligned_buffer_page_end(dst_y);                                         \
+  free_aligned_buffer_page_end(dst_uv);                                        \
+}
+
+TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
+TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,  \
+                       W1280, N, NEG, OFF, FMT_C, BPP_C)                       \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) {                \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                       \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_v, kSizeUV + OFF);                                 \
+  align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);                 \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                             \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          dst_argb_b + OFF, kStrideB,                          \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  /* Convert to a 3rd format in 1 step and 2 steps and compare  */             \
+  const int kStrideC = kWidth * BPP_C;                                         \
+  align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);                \
+  memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                             \
+  memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                            \
+  FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        dst_argb_c + OFF, kStrideC,                            \
+                        kWidth, NEG kHeight);                                  \
+  /* Convert B to C */                                                         \
+  FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB,                                 \
+                   dst_argb_bc + OFF, kStrideC,                                \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kStrideC * kHeight; ++i) {                               \
+    EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                      \
+  }                                                                            \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+  free_aligned_buffer_page_end(dst_argb_b);                                    \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_bc);                                   \
+}
+
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,   \
+                      FMT_C, BPP_C)                                            \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C)                        \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C)                      \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Invert, -, 0, FMT_C, BPP_C)                         \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I411, 4, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+
+#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+                       W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN)                \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) {                \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                       \
+  const int kSizeUV =                                                          \
+    SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);              \
+  align_buffer_page_end(src_y, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(src_u, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_v, kSizeUV + OFF);                                 \
+  align_buffer_page_end(src_a, kWidth * kHeight + OFF);                        \
+  align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);                 \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+    src_a[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                             \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),           \
+                          src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),           \
+                          src_a + OFF, kWidth,                                 \
+                          dst_argb_b + OFF, kStrideB,                          \
+                          kWidth, NEG kHeight, ATTEN);                         \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  /* Convert to a 3rd format in 1 step and 2 steps and compare  */             \
+  const int kStrideC = kWidth * BPP_C;                                         \
+  align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);                \
+  memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                             \
+  memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                            \
+  FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),             \
+                        src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),             \
+                        src_a + OFF, kWidth,                                   \
+                        dst_argb_c + OFF, kStrideC,                            \
+                        kWidth, NEG kHeight, ATTEN);                           \
+  /* Convert B to C */                                                         \
+  FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB,                                 \
+                   dst_argb_bc + OFF, kStrideC,                                \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kStrideC * kHeight; ++i) {                               \
+    EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                      \
+  }                                                                            \
+  free_aligned_buffer_page_end(src_y);                                         \
+  free_aligned_buffer_page_end(src_u);                                         \
+  free_aligned_buffer_page_end(src_v);                                         \
+  free_aligned_buffer_page_end(src_a);                                         \
+  free_aligned_buffer_page_end(dst_argb_b);                                    \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_bc);                                   \
+}
+
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,  \
+                      FMT_C, BPP_C)                                            \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0)                     \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0)                   \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0)                      \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0)                         \
+      TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,   \
+          benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
+
+TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+
+}  // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
index 52810e80..0cd06f9b 100644
--- a/files/unit_test/cpu_test.cc
+++ b/files/unit_test/cpu_test.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -18,8 +18,8 @@
 
 namespace libyuv {
 
-TEST_F(libyuvTest, TestCpuHas) {
-  int cpu_flags = TestCpuFlag(~kCpuInitialized);
+TEST_F(LibYUVBaseTest, TestCpuHas) {
+  int cpu_flags = TestCpuFlag(-1);
   printf("Cpu Flags %x\n", cpu_flags);
   int has_arm = TestCpuFlag(kCpuHasARM);
   printf("Has ARM %x\n", has_arm);
@@ -39,14 +39,46 @@ TEST_F(libyuvTest, TestCpuHas) {
   printf("Has AVX %x\n", has_avx);
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   printf("Has AVX2 %x\n", has_avx2);
+  int has_erms = TestCpuFlag(kCpuHasERMS);
+  printf("Has ERMS %x\n", has_erms);
+  int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+  printf("Has FMA3 %x\n", has_fma3);
+  int has_avx3 = TestCpuFlag(kCpuHasAVX3);
+  printf("Has AVX3 %x\n", has_avx3);
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  printf("Has MIPS %x\n", has_mips);
+  int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
+  printf("Has DSPR2 %x\n", has_dspr2);
+}
+
+TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
+#if defined(__aarch64__)
+  printf("Arm64 build\n");
+#endif
+#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
+  printf("Neon build enabled\n");
+#endif
+#if defined(__x86_64__) || defined(_M_X64)
+  printf("x64 build\n");
+#endif
+#ifdef _MSC_VER
+printf("_MSC_VER %d\n", _MSC_VER);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
+  printf("Has AVX2 1\n");
+#else
+  printf("Has AVX2 0\n");
+  // If compiler does not support AVX2, the following function not expected:
+#endif
 }
 
 #if defined(__i386__) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_X64)
-TEST_F(libyuvTest, TestCpuId) {
+TEST_F(LibYUVBaseTest, TestCpuId) {
   int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
-    int cpu_info[4];
+    uint32 cpu_info[4];
     // Vendor ID:
     // AuthenticAMD AMD processor
     // CentaurHauls Centaur processor
@@ -58,7 +90,7 @@ TEST_F(libyuvTest, TestCpuId) {
     // RiseRiseRise Rise Technology processor
     // SiS SiS SiS  SiS processor
     // UMC UMC UMC  UMC processor
-    CpuId(cpu_info, 0);
+    CpuId(0, 0, cpu_info);
     cpu_info[0] = cpu_info[1];  // Reorder output
     cpu_info[1] = cpu_info[3];
     cpu_info[3] = 0;
@@ -73,7 +105,7 @@ TEST_F(libyuvTest, TestCpuId) {
     // 13:12 - Processor Type
     // 19:16 - Extended Model
     // 27:20 - Extended Family
-    CpuId(cpu_info, 1);
+    CpuId(1, 0, cpu_info);
     int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
     int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
     printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
@@ -82,18 +114,25 @@ TEST_F(libyuvTest, TestCpuId) {
 }
 #endif
 
-TEST_F(libyuvTest, TestLinuxNeon) {
-  int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
-  if (testdata) {
-    EXPECT_EQ(kCpuInitialized,
-              ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
-    EXPECT_EQ((kCpuInitialized | kCpuHasNEON),
-              ArmCpuCaps("unit_test/testdata/tegra3.txt"));
+static int FileExists(const char* file_name) {
+  FILE* f = fopen(file_name, "r");
+  if (!f) {
+    return 0;
+  }
+  fclose(f);
+  return 1;
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxNeon) {
+  if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
+    EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
   } else {
-    printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
+    printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
   }
 #if defined(__linux__) && defined(__ARM_NEON__)
-  EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
+  EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
 #endif
 }
 
diff --git a/files/unit_test/math_test.cc b/files/unit_test/math_test.cc
new file mode 100644
index 00000000..19af9f6b
--- /dev/null
+++ b/files/unit_test/math_test.cc
@@ -0,0 +1,155 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestFixedDiv) {
+  int num[1280];
+  int div[1280];
+  int result_opt[1280];
+  int result_c[1280];
+
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));
+  EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
+  // TODO(fbarchard): Avoid the following that throw exceptions.
+  // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
+  // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
+
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+  EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+  EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+  EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+  EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+  EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+  EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+  EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+  EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+  EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+  EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+  for (int i = 1; i < 4100; ++i) {
+    EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+    EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+    EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+    EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+    EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+    EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+  }
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+  MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+  MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+  for (int j = 0; j < 1280; ++j) {
+    if (div[j] == 0) {
+      div[j] = 1280;
+    }
+    num[j] &= 0xffff;  // Clamp to avoid divide overflow.
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    for (int j = 0; j < 1280; ++j) {
+      result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+    }
+  }
+  for (int j = 0; j < 1280; ++j) {
+    result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
+  int num[1280];
+  int div[1280];
+  int result_opt[1280];
+  int result_c[1280];
+
+  MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+  MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+  for (int j = 0; j < 1280; ++j) {
+    num[j] &= 4095;  // Make numerator smaller.
+    div[j] &= 4095;  // Make divisor smaller.
+    if (div[j] == 0) {
+      div[j] = 1280;
+    }
+  }
+
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    if (has_x86) {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+      }
+    } else {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]);
+      }
+    }
+  }
+  for (int j = 0; j < 1280; ++j) {
+    result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
+  int num[1280];
+  int div[1280];
+  int result_opt[1280];
+  int result_c[1280];
+
+  MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+  MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+  for (int j = 0; j < 1280; ++j) {
+    num[j] &= 4095;  // Make numerator smaller.
+    div[j] &= 4095;  // Make divisor smaller.
+    if (div[j] <= 1) {
+      div[j] = 1280;
+    }
+  }
+
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    if (has_x86) {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv1(num[j], div[j]);
+      }
+    } else {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+      }
+    }
+  }
+  for (int j = 0; j < 1280; ++j) {
+    result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+}  // namespace libyuv
diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc
index e9053a35..bc0eebb5 100644
--- a/files/unit_test/planar_test.cc
+++ b/files/unit_test/planar_test.cc
@@ -4,460 +4,257 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include <stdlib.h>
 #include <time.h>
 
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
 #include "libyuv/convert_argb.h"
 #include "libyuv/convert_from.h"
-#include "libyuv/compare.h"
+#include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "../unit_test/unit_test.h"
 
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#else  // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#endif
-
 namespace libyuv {
 
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) {                        \
-  const int kWidth = 1280;                                                     \
-  const int kHeight = 720;                                                     \
-  const int kStride = (kWidth * 8 * BPP_B + 7) / 8;                            \
-  align_buffer_16(src_y, kWidth * kHeight);                                    \
-  align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);            \
-  align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);            \
-  align_buffer_16(dst_argb_c, kStride * kHeight);                              \
-  align_buffer_16(dst_argb_opt, kStride * kHeight);                            \
-  srandom(time(NULL));                                                         \
-  for (int i = 0; i < kHeight; ++i)                                            \
-    for (int j = 0; j < kWidth; ++j)                                           \
-      src_y[(i * kWidth) + j] = (random() & 0xff);                             \
-  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i)                                \
-    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) {                             \
-      src_u[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff);                 \
-      src_v[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff);                 \
-    }                                                                          \
-  MaskCpuFlags(kCpuInitialized);                                               \
-  FMT_PLANAR##To##FMT_B(src_y, kWidth,                                         \
-                        src_u, kWidth / SUBSAMP_X,                             \
-                        src_v, kWidth / SUBSAMP_X,                             \
-                        dst_argb_c, kStride,                                   \
-                        kWidth, NEG kHeight);                                  \
-  MaskCpuFlags(-1);                                                            \
-  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    FMT_PLANAR##To##FMT_B(src_y, kWidth,                                       \
-                          src_u, kWidth / SUBSAMP_X,                           \
-                          src_v, kWidth / SUBSAMP_X,                           \
-                          dst_argb_opt, kStride,                               \
-                          kWidth, NEG kHeight);                                \
-  }                                                                            \
-  int max_diff = 0;                                                            \
-  for (int i = 0; i < kHeight; ++i) {                                          \
-    for (int j = 0; j < kWidth * BPP_B; ++j) {                                 \
-      int abs_diff =                                                           \
-          abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) -           \
-              static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]));         \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 2);                                                      \
-  free_aligned_buffer_16(src_y)                                                \
-  free_aligned_buffer_16(src_u)                                                \
-  free_aligned_buffer_16(src_v)                                                \
-  free_aligned_buffer_16(dst_argb_c)                                           \
-  free_aligned_buffer_16(dst_argb_opt)                                         \
-}
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B)          \
-    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +)        \
-    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4)
-TESTPLANARTOB(I420, 2, 2, RAW, 3)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4)
-TESTPLANARTOB(I411, 4, 1, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2)
-// TODO(fbarchard): Re-enable test and fix valgrind.
-// TESTPLANARTOB(I420, 2, 2, V210, 16 / 6)
-TESTPLANARTOB(I420, 2, 2, I400, 1)
-TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1)
-TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1)
-TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1)
-TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,       \
-                         N, NEG)                                               \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) {                        \
-  const int kWidth = 1280;                                                     \
-  const int kHeight = 720;                                                     \
-  align_buffer_16(src_y, kWidth * kHeight);                                    \
-  align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2);       \
-  align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight);                     \
-  align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight);                   \
-  srandom(time(NULL));                                                         \
-  for (int i = 0; i < kHeight; ++i)                                            \
-    for (int j = 0; j < kWidth; ++j)                                           \
-      src_y[(i * kWidth) + j] = (random() & 0xff);                             \
-  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i)                                \
-    for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) {                         \
-      src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff);            \
-    }                                                                          \
-  MaskCpuFlags(kCpuInitialized);                                               \
-  FMT_PLANAR##To##FMT_B(src_y, kWidth,                                         \
-                        src_uv, kWidth / SUBSAMP_X * 2,                        \
-                        dst_argb_c, kWidth * BPP_B,                            \
-                        kWidth, NEG kHeight);                                  \
-  MaskCpuFlags(-1);                                                            \
-  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    FMT_PLANAR##To##FMT_B(src_y, kWidth,                                       \
-                          src_uv, kWidth / SUBSAMP_X * 2,                      \
-                          dst_argb_opt, kWidth * BPP_B,                        \
-                          kWidth, NEG kHeight);                                \
-  }                                                                            \
-  int max_diff = 0;                                                            \
-  for (int i = 0; i < kHeight; ++i) {                                          \
-    for (int j = 0; j < kWidth * BPP_B; ++j) {                                 \
-      int abs_diff =                                                           \
-        abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) -             \
-            static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]));           \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 3);                                                      \
-  free_aligned_buffer_16(src_y)                                                \
-  free_aligned_buffer_16(src_uv)                                               \
-  free_aligned_buffer_16(dst_argb_c)                                           \
-  free_aligned_buffer_16(dst_argb_opt)                                         \
-}
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B)        \
-    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +)      \
-    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
-
-#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) {                        \
-  const int kWidth = 1280;                                                     \
-  const int kHeight = 720;                                                     \
-  const int kStride = (kWidth * 8 * BPP_A + 7) / 8;                            \
-  align_buffer_16(src_argb, kStride * kHeight);                                \
-  align_buffer_16(dst_y_c, kWidth * kHeight);                                  \
-  align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
-  align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
-  align_buffer_16(dst_y_opt, kWidth * kHeight);                                \
-  align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);        \
-  align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);        \
-  srandom(time(NULL));                                                         \
-  for (int i = 0; i < kHeight; ++i)                                            \
-    for (int j = 0; j < kStride; ++j)                                          \
-      src_argb[(i * kStride) + j] = (random() & 0xff);                         \
-  MaskCpuFlags(kCpuInitialized);                                               \
-  FMT_A##To##FMT_PLANAR(src_argb, kStride,                                     \
-                        dst_y_c, kWidth,                                       \
-                        dst_u_c, kWidth / SUBSAMP_X,                           \
-                        dst_v_c, kWidth / SUBSAMP_X,                           \
-                        kWidth, NEG kHeight);                                  \
-  MaskCpuFlags(-1);                                                            \
-  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    FMT_A##To##FMT_PLANAR(src_argb, kStride,                                   \
-                          dst_y_opt, kWidth,                                   \
-                          dst_u_opt, kWidth / SUBSAMP_X,                       \
-                          dst_v_opt, kWidth / SUBSAMP_X,                       \
-                          kWidth, NEG kHeight);                                \
-  }                                                                            \
-  int max_diff = 0;                                                            \
-  for (int i = 0; i < kHeight; ++i) {                                          \
-    for (int j = 0; j < kWidth; ++j) {                                         \
-      int abs_diff =                                                           \
-          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
-              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 2);                                                      \
-  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) {                              \
-    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) {                             \
-      int abs_diff =                                                           \
-          abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) -          \
-              static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j]));        \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 2);                                                      \
-  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) {                              \
-    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) {                             \
-      int abs_diff =                                                           \
-          abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) -          \
-              static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j]));        \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 2);                                                      \
-  free_aligned_buffer_16(dst_y_c)                                              \
-  free_aligned_buffer_16(dst_u_c)                                              \
-  free_aligned_buffer_16(dst_v_c)                                              \
-  free_aligned_buffer_16(dst_y_opt)                                            \
-  free_aligned_buffer_16(dst_u_opt)                                            \
-  free_aligned_buffer_16(dst_v_opt)                                            \
-  free_aligned_buffer_16(src_argb)                                             \
-}
-
-#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)          \
-    TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +)        \
-    TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -)
-
-TESTATOPLANAR(ARGB, 4, I420, 2, 2)
-TESTATOPLANAR(BGRA, 4, I420, 2, 2)
-TESTATOPLANAR(ABGR, 4, I420, 2, 2)
-TESTATOPLANAR(RGBA, 4, I420, 2, 2)
-TESTATOPLANAR(RAW, 3, I420, 2, 2)
-TESTATOPLANAR(RGB24, 3, I420, 2, 2)
-TESTATOPLANAR(RGB565, 2, I420, 2, 2)
-TESTATOPLANAR(ARGB1555, 2, I420, 2, 2)
-TESTATOPLANAR(ARGB4444, 2, I420, 2, 2)
-// TESTATOPLANAR(ARGB, 4, I411, 4, 1)
-TESTATOPLANAR(ARGB, 4, I422, 2, 1)
-// TESTATOPLANAR(ARGB, 4, I444, 1, 1)
-// TODO(fbarchard): Implement and test 411 and 444
-TESTATOPLANAR(YUY2, 2, I420, 2, 2)
-TESTATOPLANAR(UYVY, 2, I420, 2, 2)
-TESTATOPLANAR(YUY2, 2, I422, 2, 1)
-TESTATOPLANAR(UYVY, 2, I422, 2, 1)
-TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
-TESTATOPLANAR(I400, 1, I420, 2, 2)
-TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
-TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
-TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2)
-TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2)
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, N, NEG)                \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) {                             \
-  const int kWidth = 1280;                                                     \
-  const int kHeight = 720;                                                     \
-  align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight);                       \
-  align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight);                     \
-  align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight);                   \
-  srandom(time(NULL));                                                         \
-  for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) {                         \
-    src_argb[i] = (random() & 0xff);                                           \
-  }                                                                            \
-  MaskCpuFlags(kCpuInitialized);                                               \
-  FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A,                                \
-                   dst_argb_c, kWidth * BPP_B,                                 \
-                   kWidth, NEG kHeight);                                       \
-  MaskCpuFlags(-1);                                                            \
-  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A,                              \
-                     dst_argb_opt, kWidth * BPP_B,                             \
-                     kWidth, NEG kHeight);                                     \
-  }                                                                            \
-  int max_diff = 0;                                                            \
-  for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) {                         \
-    int abs_diff =                                                             \
-        abs(static_cast<int>(dst_argb_c[i]) -                                  \
-            static_cast<int>(dst_argb_opt[i]));                                \
-    if (abs_diff > max_diff) {                                                 \
-      max_diff = abs_diff;                                                     \
-    }                                                                          \
-  }                                                                            \
-  EXPECT_LE(max_diff, 2);                                                      \
-  free_aligned_buffer_16(src_argb)                                             \
-  free_aligned_buffer_16(dst_argb_c)                                           \
-  free_aligned_buffer_16(dst_argb_opt)                                         \
-}
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B)                         \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +)                       \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
-
-TESTATOB(I400, 1, 1, I400, 1)
-TESTATOB(ARGB, 4, 4, ARGB, 4)
-TESTATOB(ARGB, 4, 4, BGRA, 4)
-TESTATOB(ARGB, 4, 4, ABGR, 4)
-TESTATOB(ARGB, 4, 4, RGBA, 4)
-TESTATOB(ARGB, 4, 4, RAW, 3)
-TESTATOB(ARGB, 4, 4, RGB24, 3)
-TESTATOB(ARGB, 4, 4, RGB565, 2)
-TESTATOB(ARGB, 4, 4, ARGB1555, 2)
-TESTATOB(ARGB, 4, 4, ARGB4444, 2)
-TESTATOB(BGRA, 4, 4, ARGB, 4)
-TESTATOB(ABGR, 4, 4, ARGB, 4)
-TESTATOB(RGBA, 4, 4, ARGB, 4)
-TESTATOB(RAW, 3, 3, ARGB, 4)
-TESTATOB(RGB24, 3, 3, ARGB, 4)
-TESTATOB(RGB565, 2, 2, ARGB, 4)
-TESTATOB(ARGB1555, 2, 2, ARGB, 4)
-TESTATOB(ARGB4444, 2, 2, ARGB, 4)
-TESTATOB(YUY2, 2, 2, ARGB, 4)
-TESTATOB(UYVY, 2, 2, ARGB, 4)
-TESTATOB(M420, 3 / 2, 1, ARGB, 4)
-
-static const int kReadPad = 16;  // Allow overread of 16 bytes.
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B)                   \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) {                                \
-  srandom(time(NULL));                                                         \
-  for (int times = 0; times < benchmark_iterations_; ++times) {                \
-    const int kWidth = (random() & 63) + 1;                                    \
-    const int kHeight = (random() & 31) + 1;                                   \
-    align_buffer_page_end(src_argb, (kWidth * BPP_A) * kHeight + kReadPad);    \
-    align_buffer_page_end(dst_argb_c, (kWidth * BPP_B) * kHeight);             \
-    align_buffer_page_end(dst_argb_opt, (kWidth * BPP_B) * kHeight);           \
-    for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) {                       \
-      src_argb[i] = (random() & 0xff);                                         \
-    }                                                                          \
-    MaskCpuFlags(kCpuInitialized);                                             \
-    FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A,                              \
-                     dst_argb_c, kWidth * BPP_B,                               \
-                     kWidth, kHeight);                                         \
-    MaskCpuFlags(-1);                                                          \
-    FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A,                              \
-                     dst_argb_opt, kWidth * BPP_B,                             \
-                     kWidth, kHeight);                                         \
-    int max_diff = 0;                                                          \
-    for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) {                       \
-      int abs_diff =                                                           \
-          abs(static_cast<int>(dst_argb_c[i]) -                                \
-              static_cast<int>(dst_argb_opt[i]));                              \
-      if (abs_diff > max_diff) {                                               \
-        max_diff = abs_diff;                                                   \
-      }                                                                        \
-    }                                                                          \
-    EXPECT_LE(max_diff, 2);                                                    \
-    free_aligned_buffer_page_end(src_argb)                                     \
-    free_aligned_buffer_page_end(dst_argb_c)                                   \
-    free_aligned_buffer_page_end(dst_argb_opt)                                 \
-  }                                                                            \
-}
-
-TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4)
-TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4)
-TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4)
-TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4)
-TESTATOBRANDOM(ARGB, 4, 4, RAW, 3)
-TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3)
-TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2)
-TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2)
-TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2)
-
-TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4)
-TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4)
-TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4)
-TESTATOBRANDOM(RAW, 3, 3, ARGB, 4)
-TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4)
-TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4)
-TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4)
-TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4)
-
-TEST_F(libyuvTest, TestAttenuate) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-  SIMD_ALIGNED(uint8 atten_pixels[256][4]);
-  SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
-  SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestAttenuate) {
+  const int kSize = 1280 * 4;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(atten_pixels, kSize);
+  align_buffer_page_end(unatten_pixels, kSize);
+  align_buffer_page_end(atten2_pixels, kSize);
 
   // Test unattenuation clamps
-  orig_pixels[0][0] = 200u;
-  orig_pixels[0][1] = 129u;
-  orig_pixels[0][2] = 127u;
-  orig_pixels[0][3] = 128u;
+  orig_pixels[0 * 4 + 0] = 200u;
+  orig_pixels[0 * 4 + 1] = 129u;
+  orig_pixels[0 * 4 + 2] = 127u;
+  orig_pixels[0 * 4 + 3] = 128u;
   // Test unattenuation transparent and opaque are unaffected
-  orig_pixels[1][0] = 16u;
-  orig_pixels[1][1] = 64u;
-  orig_pixels[1][2] = 192u;
-  orig_pixels[1][3] = 0u;
-  orig_pixels[2][0] = 16u;
-  orig_pixels[2][1] = 64u;
-  orig_pixels[2][2] = 192u;
-  orig_pixels[2][3] = 255u;
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 128u;
-  ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
-  EXPECT_EQ(255u, unatten_pixels[0][0]);
-  EXPECT_EQ(255u, unatten_pixels[0][1]);
-  EXPECT_EQ(254u, unatten_pixels[0][2]);
-  EXPECT_EQ(128u, unatten_pixels[0][3]);
-  EXPECT_EQ(16u, unatten_pixels[1][0]);
-  EXPECT_EQ(64u, unatten_pixels[1][1]);
-  EXPECT_EQ(192u, unatten_pixels[1][2]);
-  EXPECT_EQ(0u, unatten_pixels[1][3]);
-  EXPECT_EQ(16u, unatten_pixels[2][0]);
-  EXPECT_EQ(64u, unatten_pixels[2][1]);
-  EXPECT_EQ(192u, unatten_pixels[2][2]);
-  EXPECT_EQ(255u, unatten_pixels[2][3]);
-  EXPECT_EQ(32u, unatten_pixels[3][0]);
-  EXPECT_EQ(128u, unatten_pixels[3][1]);
-  EXPECT_EQ(255u, unatten_pixels[3][2]);
-  EXPECT_EQ(128u, unatten_pixels[3][3]);
-
-  for (int i = 0; i < 256; ++i) {
-    orig_pixels[i][0] = i;
-    orig_pixels[i][1] = i / 2;
-    orig_pixels[i][2] = i / 3;
-    orig_pixels[i][3] = i;
+  orig_pixels[1 * 4 + 0] = 16u;
+  orig_pixels[1 * 4 + 1] = 64u;
+  orig_pixels[1 * 4 + 2] = 192u;
+  orig_pixels[1 * 4 + 3] = 0u;
+  orig_pixels[2 * 4 + 0] = 16u;
+  orig_pixels[2 * 4 + 1] = 64u;
+  orig_pixels[2 * 4 + 2] = 192u;
+  orig_pixels[2 * 4 + 3] = 255u;
+  orig_pixels[3 * 4 + 0] = 16u;
+  orig_pixels[3 * 4 + 1] = 64u;
+  orig_pixels[3 * 4 + 2] = 192u;
+  orig_pixels[3 * 4 + 3] = 128u;
+  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
+  EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i * 4 + 0] = i;
+    orig_pixels[i * 4 + 1] = i / 2;
+    orig_pixels[i * 4 + 2] = i / 3;
+    orig_pixels[i * 4 + 3] = i;
   }
-  ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
-  ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
+  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1);
+  ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
   }
-  for (int i = 0; i < 256; ++i) {
-    EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
-    EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
-    EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
-    EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
   }
   // Make sure transparent, 50% and opaque are fully accurate.
-  EXPECT_EQ(0, atten_pixels[0][0]);
-  EXPECT_EQ(0, atten_pixels[0][1]);
-  EXPECT_EQ(0, atten_pixels[0][2]);
-  EXPECT_EQ(0, atten_pixels[0][3]);
-  EXPECT_EQ(64, atten_pixels[128][0]);
-  EXPECT_EQ(32, atten_pixels[128][1]);
-  EXPECT_EQ(21,  atten_pixels[128][2]);
-  EXPECT_EQ(128, atten_pixels[128][3]);
-  EXPECT_EQ(255, atten_pixels[255][0]);
-  EXPECT_EQ(127, atten_pixels[255][1]);
-  EXPECT_EQ(85,  atten_pixels[255][2]);
-  EXPECT_EQ(255, atten_pixels[255][3]);
-}
-
-TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
+  EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
+  EXPECT_EQ(21,  atten_pixels[128 * 4 + 2]);
+  EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
+  EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
+  EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
+  EXPECT_NEAR(85,  atten_pixels[255 * 4 + 2], 1);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
+
+  free_aligned_buffer_page_end(atten2_pixels);
+  free_aligned_buffer_page_end(unatten_pixels);
+  free_aligned_buffer_page_end(atten_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+static int TestAttenuateI(int width, int height, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info,
+                          int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBAttenuate(src_argb + off, kStride,
+                dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAttenuate(src_argb + off, kStride,
+                  dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
+  int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+static int TestUnattenuateI(int width, int height, int benchmark_iterations,
+                            int disable_cpu_flags, int benchmark_cpu_info,
+                            int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (fastrand() & 0xff);
+  }
+  ARGBAttenuate(src_argb + off, kStride,
+                src_argb + off, kStride,
+                width, height);
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBUnattenuate(src_argb + off, kStride,
+                  dst_argb_c, kStride,
+                  width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBUnattenuate(src_argb + off, kStride,
+                    dst_argb_opt, kStride,
+                    width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
+  int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
+  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
+  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
+  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
   SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
   SIMD_ALIGNED(int32 added_pixels[16][16][4]);
 
@@ -484,8 +281,9 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
   }
 }
 
-TEST_F(libyuvTest, TestARGBGray) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBGray) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
   // Test blue
   orig_pixels[0][0] = 255u;
@@ -502,45 +300,62 @@ TEST_F(libyuvTest, TestARGBGray) {
   orig_pixels[2][1] = 0u;
   orig_pixels[2][2] = 255u;
   orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
   // Test color
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 224u;
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
-  EXPECT_EQ(27u, orig_pixels[0][0]);
-  EXPECT_EQ(27u, orig_pixels[0][1]);
-  EXPECT_EQ(27u, orig_pixels[0][2]);
+  EXPECT_EQ(30u, orig_pixels[0][0]);
+  EXPECT_EQ(30u, orig_pixels[0][1]);
+  EXPECT_EQ(30u, orig_pixels[0][2]);
   EXPECT_EQ(128u, orig_pixels[0][3]);
-  EXPECT_EQ(151u, orig_pixels[1][0]);
-  EXPECT_EQ(151u, orig_pixels[1][1]);
-  EXPECT_EQ(151u, orig_pixels[1][2]);
+  EXPECT_EQ(149u, orig_pixels[1][0]);
+  EXPECT_EQ(149u, orig_pixels[1][1]);
+  EXPECT_EQ(149u, orig_pixels[1][2]);
   EXPECT_EQ(0u, orig_pixels[1][3]);
-  EXPECT_EQ(75u, orig_pixels[2][0]);
-  EXPECT_EQ(75u, orig_pixels[2][1]);
-  EXPECT_EQ(75u, orig_pixels[2][2]);
+  EXPECT_EQ(76u, orig_pixels[2][0]);
+  EXPECT_EQ(76u, orig_pixels[2][1]);
+  EXPECT_EQ(76u, orig_pixels[2][2]);
   EXPECT_EQ(255u, orig_pixels[2][3]);
-  EXPECT_EQ(96u, orig_pixels[3][0]);
-  EXPECT_EQ(96u, orig_pixels[3][1]);
-  EXPECT_EQ(96u, orig_pixels[3][2]);
-  EXPECT_EQ(224u, orig_pixels[3][3]);
-
-  for (int i = 0; i < 256; ++i) {
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(255u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(96u, orig_pixels[5][0]);
+  EXPECT_EQ(96u, orig_pixels[5][1]);
+  EXPECT_EQ(96u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
-
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestARGBGrayTo) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-  SIMD_ALIGNED(uint8 gray_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 gray_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
   // Test blue
   orig_pixels[0][0] = 255u;
@@ -557,44 +372,61 @@ TEST_F(libyuvTest, TestARGBGrayTo) {
   orig_pixels[2][1] = 0u;
   orig_pixels[2][2] = 255u;
   orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
   // Test color
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 224u;
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
-  EXPECT_EQ(27u, gray_pixels[0][0]);
-  EXPECT_EQ(27u, gray_pixels[0][1]);
-  EXPECT_EQ(27u, gray_pixels[0][2]);
+  EXPECT_EQ(30u, gray_pixels[0][0]);
+  EXPECT_EQ(30u, gray_pixels[0][1]);
+  EXPECT_EQ(30u, gray_pixels[0][2]);
   EXPECT_EQ(128u, gray_pixels[0][3]);
-  EXPECT_EQ(151u, gray_pixels[1][0]);
-  EXPECT_EQ(151u, gray_pixels[1][1]);
-  EXPECT_EQ(151u, gray_pixels[1][2]);
+  EXPECT_EQ(149u, gray_pixels[1][0]);
+  EXPECT_EQ(149u, gray_pixels[1][1]);
+  EXPECT_EQ(149u, gray_pixels[1][2]);
   EXPECT_EQ(0u, gray_pixels[1][3]);
-  EXPECT_EQ(75u, gray_pixels[2][0]);
-  EXPECT_EQ(75u, gray_pixels[2][1]);
-  EXPECT_EQ(75u, gray_pixels[2][2]);
+  EXPECT_EQ(76u, gray_pixels[2][0]);
+  EXPECT_EQ(76u, gray_pixels[2][1]);
+  EXPECT_EQ(76u, gray_pixels[2][2]);
   EXPECT_EQ(255u, gray_pixels[2][3]);
-  EXPECT_EQ(96u, gray_pixels[3][0]);
-  EXPECT_EQ(96u, gray_pixels[3][1]);
-  EXPECT_EQ(96u, gray_pixels[3][2]);
-  EXPECT_EQ(224u, gray_pixels[3][3]);
-
-  for (int i = 0; i < 256; ++i) {
+  EXPECT_EQ(0u, gray_pixels[3][0]);
+  EXPECT_EQ(0u, gray_pixels[3][1]);
+  EXPECT_EQ(0u, gray_pixels[3][2]);
+  EXPECT_EQ(255u, gray_pixels[3][3]);
+  EXPECT_EQ(255u, gray_pixels[4][0]);
+  EXPECT_EQ(255u, gray_pixels[4][1]);
+  EXPECT_EQ(255u, gray_pixels[4][2]);
+  EXPECT_EQ(255u, gray_pixels[4][3]);
+  EXPECT_EQ(96u, gray_pixels[5][0]);
+  EXPECT_EQ(96u, gray_pixels[5][1]);
+  EXPECT_EQ(96u, gray_pixels[5][2]);
+  EXPECT_EQ(224u, gray_pixels[5][3]);
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
-
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestARGBSepia) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBSepia) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
   // Test blue
   orig_pixels[0][0] = 255u;
@@ -611,11 +443,21 @@ TEST_F(libyuvTest, TestARGBSepia) {
   orig_pixels[2][1] = 0u;
   orig_pixels[2][2] = 255u;
   orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
   // Test color
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 224u;
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
   EXPECT_EQ(33u, orig_pixels[0][0]);
@@ -630,32 +472,119 @@ TEST_F(libyuvTest, TestARGBSepia) {
   EXPECT_EQ(89u, orig_pixels[2][1]);
   EXPECT_EQ(99u, orig_pixels[2][2]);
   EXPECT_EQ(255u, orig_pixels[2][3]);
-  EXPECT_EQ(88u, orig_pixels[3][0]);
-  EXPECT_EQ(114u, orig_pixels[3][1]);
-  EXPECT_EQ(127u, orig_pixels[3][2]);
-  EXPECT_EQ(224u, orig_pixels[3][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(239u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(88u, orig_pixels[5][0]);
+  EXPECT_EQ(114u, orig_pixels[5][1]);
+  EXPECT_EQ(127u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+
+  // Matrix for Sepia.
+  SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+    17 / 2, 68 / 2, 35 / 2, 0,
+    22 / 2, 88 / 2, 45 / 2, 0,
+    24 / 2, 98 / 2, 50 / 2, 0,
+    0, 0, 0, 64,  // Copy alpha.
+  };
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
-  for (int i = 0; i < 256; ++i) {
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test color
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                  &kRGBToSepia[0], 16, 1);
+  EXPECT_EQ(31u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(43u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(47u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(135u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(175u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(195u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(67u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(87u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(99u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(87u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(112u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(127u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                  &kRGBToSepia[0], 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                    &kRGBToSepia[0], 1280, 1);
+  }
 
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
   }
 }
 
-TEST_F(libyuvTest, TestARGBColorMatrix) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
 
   // Matrix for Sepia.
-  static const int8 kARGBToSepia[] = {
+  SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
     17, 68, 35, 0,
     22, 88, 45, 0,
     24, 98, 50, 0,
+    0, 0, 0, 0,  // Unused but makes matrix 16 bytes.
   };
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
   // Test blue
   orig_pixels[0][0] = 255u;
@@ -678,8 +607,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
   orig_pixels[3][2] = 192u;
   orig_pixels[3][3] = 224u;
   // Do 16 to test asm version.
-  ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
-  EXPECT_EQ(33u, orig_pixels[0][0]);
+  RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1);
+  EXPECT_EQ(31u, orig_pixels[0][0]);
   EXPECT_EQ(43u, orig_pixels[0][1]);
   EXPECT_EQ(47u, orig_pixels[0][2]);
   EXPECT_EQ(128u, orig_pixels[0][3]);
@@ -687,29 +616,28 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
   EXPECT_EQ(175u, orig_pixels[1][1]);
   EXPECT_EQ(195u, orig_pixels[1][2]);
   EXPECT_EQ(0u, orig_pixels[1][3]);
-  EXPECT_EQ(69u, orig_pixels[2][0]);
-  EXPECT_EQ(89u, orig_pixels[2][1]);
+  EXPECT_EQ(67u, orig_pixels[2][0]);
+  EXPECT_EQ(87u, orig_pixels[2][1]);
   EXPECT_EQ(99u, orig_pixels[2][2]);
   EXPECT_EQ(255u, orig_pixels[2][3]);
-  EXPECT_EQ(88u, orig_pixels[3][0]);
-  EXPECT_EQ(114u, orig_pixels[3][1]);
+  EXPECT_EQ(87u, orig_pixels[3][0]);
+  EXPECT_EQ(112u, orig_pixels[3][1]);
   EXPECT_EQ(127u, orig_pixels[3][2]);
   EXPECT_EQ(224u, orig_pixels[3][3]);
 
-  for (int i = 0; i < 256; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
-
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestARGBColorTable) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
   memset(orig_pixels, 0, sizeof(orig_pixels));
 
   // Matrix for Sepia.
@@ -755,68 +683,127 @@ TEST_F(libyuvTest, TestARGBColorTable) {
   EXPECT_EQ(11u, orig_pixels[3][2]);
   EXPECT_EQ(16u, orig_pixels[3][3]);
 
-  for (int i = 0; i < 256; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
+  }
+}
+
+// Same as TestARGBColorTable except alpha does not change.
+TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Matrix for Sepia.
+  static const uint8 kARGBTable[256 * 4] = {
+    1u, 2u, 3u, 4u,
+    5u, 6u, 7u, 8u,
+    9u, 10u, 11u, 12u,
+    13u, 14u, 15u, 16u,
+  };
+
+  orig_pixels[0][0] = 0u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 0u;
+  orig_pixels[1][0] = 1u;
+  orig_pixels[1][1] = 1u;
+  orig_pixels[1][2] = 1u;
+  orig_pixels[1][3] = 1u;
+  orig_pixels[2][0] = 2u;
+  orig_pixels[2][1] = 2u;
+  orig_pixels[2][2] = 2u;
+  orig_pixels[2][3] = 2u;
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 1u;
+  orig_pixels[3][2] = 2u;
+  orig_pixels[3][3] = 3u;
+  // Do 16 to test asm version.
+  RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+  EXPECT_EQ(1u, orig_pixels[0][0]);
+  EXPECT_EQ(2u, orig_pixels[0][1]);
+  EXPECT_EQ(3u, orig_pixels[0][2]);
+  EXPECT_EQ(0u, orig_pixels[0][3]);  // Alpha unchanged.
+  EXPECT_EQ(5u, orig_pixels[1][0]);
+  EXPECT_EQ(6u, orig_pixels[1][1]);
+  EXPECT_EQ(7u, orig_pixels[1][2]);
+  EXPECT_EQ(1u, orig_pixels[1][3]);  // Alpha unchanged.
+  EXPECT_EQ(9u, orig_pixels[2][0]);
+  EXPECT_EQ(10u, orig_pixels[2][1]);
+  EXPECT_EQ(11u, orig_pixels[2][2]);
+  EXPECT_EQ(2u, orig_pixels[2][3]);  // Alpha unchanged.
+  EXPECT_EQ(1u, orig_pixels[3][0]);
+  EXPECT_EQ(6u, orig_pixels[3][1]);
+  EXPECT_EQ(11u, orig_pixels[3][2]);
+  EXPECT_EQ(3u, orig_pixels[3][3]);  // Alpha unchanged.
 
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1);
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestARGBQuantize) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
 
-  for (int i = 0; i < 256; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i;
   }
   ARGBQuantize(&orig_pixels[0][0], 0,
-               (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+               (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
 
-  for (int i = 0; i < 256; ++i) {
-    EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]);
-    EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]);
-    EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]);
-    EXPECT_EQ(i, orig_pixels[i][3]);
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
+    EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]);
+    EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]);
+    EXPECT_EQ(i & 255, orig_pixels[i][3]);
   }
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     ARGBQuantize(&orig_pixels[0][0], 0,
-                 (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+                 (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestARGBMirror) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-  SIMD_ALIGNED(uint8 dst_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBMirror) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
 
-  for (int i = 0; i < 256; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
     orig_pixels[i][2] = i / 3;
     orig_pixels[i][3] = i / 4;
   }
-  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
 
-  for (int i = 0; i < 256; ++i) {
-    EXPECT_EQ(i, dst_pixels[255 - i][0]);
-    EXPECT_EQ(i / 2, dst_pixels[255 - i][1]);
-    EXPECT_EQ(i / 3, dst_pixels[255 - i][2]);
-    EXPECT_EQ(i / 4, dst_pixels[255 - i][3]);
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
+    EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
+    EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
+    EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
   }
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
   }
 }
 
-TEST_F(libyuvTest, TestShade) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-  SIMD_ALIGNED(uint8 shade_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestShade) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 shade_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
 
   orig_pixels[0][0] = 10u;
   orig_pixels[0][1] = 20u;
@@ -834,7 +821,8 @@ TEST_F(libyuvTest, TestShade) {
   orig_pixels[3][1] = 0u;
   orig_pixels[3][2] = 0u;
   orig_pixels[3][3] = 0u;
-  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
+  // Do 8 pixels to allow opt version to be used.
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
   EXPECT_EQ(10u, shade_pixels[0][0]);
   EXPECT_EQ(20u, shade_pixels[0][1]);
   EXPECT_EQ(40u, shade_pixels[0][2]);
@@ -852,22 +840,30 @@ TEST_F(libyuvTest, TestShade) {
   EXPECT_EQ(0u, shade_pixels[3][2]);
   EXPECT_EQ(0u, shade_pixels[3][3]);
 
-  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
   EXPECT_EQ(5u, shade_pixels[0][0]);
   EXPECT_EQ(10u, shade_pixels[0][1]);
   EXPECT_EQ(20u, shade_pixels[0][2]);
   EXPECT_EQ(40u, shade_pixels[0][3]);
 
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-    ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
+  EXPECT_EQ(5u, shade_pixels[0][0]);
+  EXPECT_EQ(5u, shade_pixels[0][1]);
+  EXPECT_EQ(5u, shade_pixels[0][2]);
+  EXPECT_EQ(5u, shade_pixels[0][3]);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1,
               0x80808080);
   }
 }
 
-TEST_F(libyuvTest, TestInterpolate) {
-  SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
-  SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
-  SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
+TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
+  SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+  SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]);
+  SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]);
+  memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+  memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
 
   orig_pixels_0[0][0] = 16u;
   orig_pixels_0[0][1] = 32u;
@@ -912,15 +908,15 @@ TEST_F(libyuvTest, TestInterpolate) {
   EXPECT_EQ(0u, interpolate_pixels[1][0]);
   EXPECT_EQ(0u, interpolate_pixels[1][1]);
   EXPECT_EQ(0u, interpolate_pixels[1][2]);
-  EXPECT_NEAR(128u, interpolate_pixels[1][3], 1);  // C = 127, SSE = 128.
+  EXPECT_EQ(128u, interpolate_pixels[1][3]);
   EXPECT_EQ(0u, interpolate_pixels[2][0]);
   EXPECT_EQ(0u, interpolate_pixels[2][1]);
   EXPECT_EQ(0u, interpolate_pixels[2][2]);
   EXPECT_EQ(0u, interpolate_pixels[2][3]);
-  EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
-  EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
-  EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
-  EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
+  EXPECT_EQ(128u, interpolate_pixels[3][0]);
+  EXPECT_EQ(128u, interpolate_pixels[3][1]);
+  EXPECT_EQ(128u, interpolate_pixels[3][2]);
+  EXPECT_EQ(128u, interpolate_pixels[3][3]);
 
   ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                   &interpolate_pixels[0][0], 0, 4, 1, 0);
@@ -937,20 +933,418 @@ TEST_F(libyuvTest, TestInterpolate) {
   EXPECT_EQ(16u, interpolate_pixels[0][2]);
   EXPECT_EQ(32u, interpolate_pixels[0][3]);
 
-  for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) {
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
-                    &interpolate_pixels[0][0], 0, 256, 1, 128);
+                    &interpolate_pixels[0][0], 0, 1280, 1, 128);
   }
 }
 
-TEST_F(libyuvTest, TestAffine) {
-  SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
-  SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
-#if defined(HAS_ARGBAFFINEROW_SSE2)
-  SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]);
-#endif
+TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
+  SIMD_ALIGNED(uint8 orig_pixels_0[1280]);
+  SIMD_ALIGNED(uint8 orig_pixels_1[1280]);
+  SIMD_ALIGNED(uint8 interpolate_pixels[1280]);
+  memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+  memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
+
+  orig_pixels_0[0] = 16u;
+  orig_pixels_0[1] = 32u;
+  orig_pixels_0[2] = 64u;
+  orig_pixels_0[3] = 128u;
+  orig_pixels_0[4] = 0u;
+  orig_pixels_0[5] = 0u;
+  orig_pixels_0[6] = 0u;
+  orig_pixels_0[7] = 255u;
+  orig_pixels_0[8] = 0u;
+  orig_pixels_0[9] = 0u;
+  orig_pixels_0[10] = 0u;
+  orig_pixels_0[11] = 0u;
+  orig_pixels_0[12] = 0u;
+  orig_pixels_0[13] = 0u;
+  orig_pixels_0[14] = 0u;
+  orig_pixels_0[15] = 0u;
+
+  orig_pixels_1[0] = 0u;
+  orig_pixels_1[1] = 0u;
+  orig_pixels_1[2] = 0u;
+  orig_pixels_1[3] = 0u;
+  orig_pixels_1[4] = 0u;
+  orig_pixels_1[5] = 0u;
+  orig_pixels_1[6] = 0u;
+  orig_pixels_1[7] = 0u;
+  orig_pixels_1[8] = 0u;
+  orig_pixels_1[9] = 0u;
+  orig_pixels_1[10] = 0u;
+  orig_pixels_1[11] = 0u;
+  orig_pixels_1[12] = 255u;
+  orig_pixels_1[13] = 255u;
+  orig_pixels_1[14] = 255u;
+  orig_pixels_1[15] = 255u;
+
+  InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+                   &interpolate_pixels[0], 0, 16, 1, 128);
+  EXPECT_EQ(8u, interpolate_pixels[0]);
+  EXPECT_EQ(16u, interpolate_pixels[1]);
+  EXPECT_EQ(32u, interpolate_pixels[2]);
+  EXPECT_EQ(64u, interpolate_pixels[3]);
+  EXPECT_EQ(0u, interpolate_pixels[4]);
+  EXPECT_EQ(0u, interpolate_pixels[5]);
+  EXPECT_EQ(0u, interpolate_pixels[6]);
+  EXPECT_EQ(128u, interpolate_pixels[7]);
+  EXPECT_EQ(0u, interpolate_pixels[8]);
+  EXPECT_EQ(0u, interpolate_pixels[9]);
+  EXPECT_EQ(0u, interpolate_pixels[10]);
+  EXPECT_EQ(0u, interpolate_pixels[11]);
+  EXPECT_EQ(128u, interpolate_pixels[12]);
+  EXPECT_EQ(128u, interpolate_pixels[13]);
+  EXPECT_EQ(128u, interpolate_pixels[14]);
+  EXPECT_EQ(128u, interpolate_pixels[15]);
+
+  InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+                   &interpolate_pixels[0], 0, 16, 1, 0);
+  EXPECT_EQ(16u, interpolate_pixels[0]);
+  EXPECT_EQ(32u, interpolate_pixels[1]);
+  EXPECT_EQ(64u, interpolate_pixels[2]);
+  EXPECT_EQ(128u, interpolate_pixels[3]);
+
+  InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+                   &interpolate_pixels[0], 0, 16, 1, 192);
+
+  EXPECT_EQ(4u, interpolate_pixels[0]);
+  EXPECT_EQ(8u, interpolate_pixels[1]);
+  EXPECT_EQ(16u, interpolate_pixels[2]);
+  EXPECT_EQ(32u, interpolate_pixels[3]);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+                     &interpolate_pixels[0], 0, 1280, 1, 123);
+  }
+}
+
+#define TESTTERP(FMT_A, BPP_A, STRIDE_A,                                       \
+                 FMT_B, BPP_B, STRIDE_B,                                       \
+                 W1280, TERP, N, NEG, OFF)                                     \
+TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) {                           \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);                 \
+  align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF);                 \
+  align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                       \
+  align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                     \
+  for (int i = 0; i < kStrideA * kHeight; ++i) {                               \
+    src_argb_a[i + OFF] = (fastrand() & 0xff);                                 \
+    src_argb_b[i + OFF] = (fastrand() & 0xff);                                 \
+  }                                                                            \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  ARGBInterpolate(src_argb_a + OFF, kStrideA,                                  \
+                  src_argb_b + OFF, kStrideA,                                  \
+                  dst_argb_c, kStrideB,                                        \
+                  kWidth, NEG kHeight, TERP);                                  \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    ARGBInterpolate(src_argb_a + OFF, kStrideA,                                \
+                    src_argb_b + OFF, kStrideA,                                \
+                    dst_argb_opt, kStrideB,                                    \
+                    kWidth, NEG kHeight, TERP);                                \
+  }                                                                            \
+  for (int i = 0; i < kStrideB * kHeight; ++i) {                               \
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                                 \
+  }                                                                            \
+  free_aligned_buffer_page_end(src_argb_a);                                    \
+  free_aligned_buffer_page_end(src_argb_b);                                    \
+  free_aligned_buffer_page_end(dst_argb_c);                                    \
+  free_aligned_buffer_page_end(dst_argb_opt);                                  \
+}
+
+#define TESTINTERPOLATE(TERP)                                                  \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0)   \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0)    \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
+
+TESTINTERPOLATE(0)
+TESTINTERPOLATE(64)
+TESTINTERPOLATE(128)
+TESTINTERPOLATE(192)
+TESTINTERPOLATE(255)
+
+static int TestBlend(int width, int height, int benchmark_iterations,
+                     int disable_cpu_flags, int benchmark_cpu_info,
+                     int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(src_argb_b, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+  }
+  ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+                height);
+  ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
+                height);
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBBlend(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_c, kStride,
+            width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBBlend(src_argb_a + off, kStride,
+              src_argb_b + off, kStride,
+              dst_argb_opt, kStride,
+              width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(src_argb_b);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
+  int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_,  benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
 
-  for (int i = 0; i < 256; ++i) {
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
+  int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
+  int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
+  int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+static void TestBlendPlane(int width, int height, int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info,
+                           int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 1;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(src_argb_b, kStride * height + off);
+  align_buffer_page_end(src_argb_alpha, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height + off);
+  align_buffer_page_end(dst_argb_opt, kStride * height + off);
+  memset(dst_argb_c, 255, kStride * height + off);
+  memset(dst_argb_opt, 255, kStride * height + off);
+
+  // Test source is maintained exactly if alpha is 255.
+  for (int i = 0; i < width; ++i) {
+    src_argb_a[i + off] = i & 255;
+    src_argb_b[i + off] = 255 - (i & 255);
+  }
+  memset(src_argb_alpha + off, 255, width);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_opt + off, width,
+             width, 1);
+  for (int i = 0; i < width; ++i) {
+    EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
+  }
+  // Test destination is maintained exactly if alpha is 0.
+  memset(src_argb_alpha + off, 0, width);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_opt + off, width,
+             width, 1);
+  for (int i = 0; i < width; ++i) {
+    EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
+  }
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+    src_argb_alpha[i + off] = (fastrand() & 0xff);
+  }
+
+  MaskCpuFlags(disable_cpu_flags);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_c + off, width,
+             width, height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    BlendPlane(src_argb_a + off, width,
+               src_argb_b + off, width,
+               src_argb_alpha + off, width,
+               dst_argb_opt + off, width,
+               width, height);
+  }
+  for (int i = 0; i < kStride * height; ++i) {
+    EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(src_argb_b);
+  free_aligned_buffer_page_end(src_argb_alpha);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return;
+}
+
+TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) {
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Any) {
+  TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
+}
+
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+
+static void TestI420Blend(int width, int height, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info,
+                          int invert, int off) {
+  width = ((width) > 0) ? (width) : 1;
+  const int kStrideUV = SUBSAMPLE(width, 2);
+  const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2);
+  align_buffer_page_end(src_y0, width * height + off);
+  align_buffer_page_end(src_u0, kSizeUV + off);
+  align_buffer_page_end(src_v0, kSizeUV + off);
+  align_buffer_page_end(src_y1, width * height + off);
+  align_buffer_page_end(src_u1, kSizeUV + off);
+  align_buffer_page_end(src_v1, kSizeUV + off);
+  align_buffer_page_end(src_a, width * height + off);
+  align_buffer_page_end(dst_y_c, width * height + off);
+  align_buffer_page_end(dst_u_c, kSizeUV + off);
+  align_buffer_page_end(dst_v_c, kSizeUV + off);
+  align_buffer_page_end(dst_y_opt, width * height + off);
+  align_buffer_page_end(dst_u_opt, kSizeUV + off);
+  align_buffer_page_end(dst_v_opt, kSizeUV + off);
+
+  MemRandomize(src_y0, width * height + off);
+  MemRandomize(src_u0, kSizeUV + off);
+  MemRandomize(src_v0, kSizeUV + off);
+  MemRandomize(src_y1, width * height + off);
+  MemRandomize(src_u1, kSizeUV + off);
+  MemRandomize(src_v1, kSizeUV + off);
+  MemRandomize(src_a, width * height + off);
+  memset(dst_y_c, 255, width * height + off);
+  memset(dst_u_c, 255, kSizeUV + off);
+  memset(dst_v_c, 255, kSizeUV + off);
+  memset(dst_y_opt, 255, width * height + off);
+  memset(dst_u_opt, 255, kSizeUV + off);
+  memset(dst_v_opt, 255, kSizeUV + off);
+
+  MaskCpuFlags(disable_cpu_flags);
+  I420Blend(src_y0 + off, width,
+            src_u0 + off, kStrideUV,
+            src_v0 + off, kStrideUV,
+            src_y1 + off, width,
+            src_u1 + off, kStrideUV,
+            src_v1 + off, kStrideUV,
+            src_a + off, width,
+            dst_y_c + off, width,
+            dst_u_c + off, kStrideUV,
+            dst_v_c + off, kStrideUV,
+            width, height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    I420Blend(src_y0 + off, width,
+              src_u0 + off, kStrideUV,
+              src_v0 + off, kStrideUV,
+              src_y1 + off, width,
+              src_u1 + off, kStrideUV,
+              src_v1 + off, kStrideUV,
+              src_a + off, width,
+              dst_y_opt + off, width,
+              dst_u_opt + off, kStrideUV,
+              dst_v_opt + off, kStrideUV,
+              width, height);
+  }
+  for (int i = 0; i < width * height; ++i) {
+    EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
+  }
+  for (int i = 0; i < kSizeUV; ++i) {
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
+  }
+  free_aligned_buffer_page_end(src_y0);
+  free_aligned_buffer_page_end(src_u0);
+  free_aligned_buffer_page_end(src_v0);
+  free_aligned_buffer_page_end(src_y1);
+  free_aligned_buffer_page_end(src_u1);
+  free_aligned_buffer_page_end(src_v1);
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(dst_y_c);
+  free_aligned_buffer_page_end(dst_u_c);
+  free_aligned_buffer_page_end(dst_v_c);
+  free_aligned_buffer_page_end(dst_y_opt);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_opt);
+  return;
+}
+
+TEST_F(LibYUVPlanarTest, I420Blend_Opt) {
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+
+// TODO(fbarchard): DISABLED because _Any uses C.  Avoid C and re-enable.
+TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {
+  TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Invert) {
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestAffine) {
+  SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+  SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
+
+  for (int i = 0; i < 1280; ++i) {
     for (int j = 0; j < 4; ++j) {
       orig_pixels_0[i][j] = i;
     }
@@ -959,47 +1353,1009 @@ TEST_F(libyuvTest, TestAffine) {
   float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
 
   ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
-                  uv_step, 256);
+                  uv_step, 1280);
   EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
   EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
   EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
 
 #if defined(HAS_ARGBAFFINEROW_SSE2)
+  SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]);
   ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
-                     uv_step, 256);
-  EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4));
-#endif
+                     uv_step, 1280);
+  EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
 
-#if defined(HAS_ARGBAFFINEROW_SSE2)
   int has_sse2 = TestCpuFlag(kCpuHasSSE2);
   if (has_sse2) {
-    for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+    for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
       ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
-                         uv_step, 256);
+                         uv_step, 1280);
     }
-  } else {
+  }
 #endif
-    for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
-      ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
-                      uv_step, 256);
+}
+
+TEST_F(LibYUVPlanarTest, TestCopyPlane) {
+  int err = 0;
+  int yw = benchmark_width_;
+  int yh = benchmark_height_;
+  int b = 12;
+  int i, j;
+
+  int y_plane_size = (yw + b * 2) * (yh + b * 2);
+  align_buffer_page_end(orig_y, y_plane_size);
+  align_buffer_page_end(dst_c, y_plane_size);
+  align_buffer_page_end(dst_opt, y_plane_size);
+
+  memset(orig_y, 0, y_plane_size);
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 0, y_plane_size);
+
+  // Fill image buffers with random data.
+  for (i = b; i < (yh + b); ++i) {
+    for (j = b; j < (yw + b); ++j) {
+      orig_y[i * (yw + b * 2) + j] = fastrand() & 0xff;
     }
-#if defined(HAS_ARGBAFFINEROW_SSE2)
   }
-#endif
+
+  // Fill destination buffers with random data.
+  for (i = 0; i < y_plane_size; ++i) {
+    uint8 random_number = fastrand() & 0x7f;
+    dst_c[i] = random_number;
+    dst_opt[i] = dst_c[i];
+  }
+
+  int y_off = b * (yw + b * 2) + b;
+
+  int y_st = yw + b * 2;
+  int stride = 8;
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags_);
+  double c_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
+  }
+  c_time = (get_time() - c_time) / benchmark_iterations_;
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info_);
+  double opt_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+  for (i = 0; i < y_plane_size; ++i) {
+    if (dst_c[i] != dst_opt[i])
+      ++err;
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_c);
+  free_aligned_buffer_page_end(dst_opt);
+
+  EXPECT_EQ(0, err);
 }
 
-TEST_F(libyuvTest, Test565) {
-  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-  SIMD_ALIGNED(uint8 pixels565[256][2]);
+static int TestMultiply(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(src_argb_b, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBMultiply(src_argb_a + off, kStride,
+               src_argb_b + off, kStride,
+               dst_argb_c, kStride,
+               width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBMultiply(src_argb_a + off, kStride,
+                 src_argb_b + off, kStride,
+                 dst_argb_opt, kStride,
+                 width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(src_argb_b);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
 
-  for (int i = 0; i < 256; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      orig_pixels[i][j] = i;
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
+  int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) {
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) {
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) {
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+static int TestAdd(int width, int height, int benchmark_iterations,
+                   int disable_cpu_flags,  int benchmark_cpu_info,
+                   int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(src_argb_b, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBAdd(src_argb_a + off, kStride,
+          src_argb_b + off, kStride,
+          dst_argb_c, kStride,
+          width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAdd(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_opt, kStride,
+            width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(src_argb_b);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
+  int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_,  benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+static int TestSubtract(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(src_argb_b, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSubtract(src_argb_a + off, kStride,
+               src_argb_b + off, kStride,
+               dst_argb_c, kStride,
+               width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBSubtract(src_argb_a + off, kStride,
+                 src_argb_b + off, kStride,
+                 dst_argb_opt, kStride,
+                 width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(src_argb_b);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
+  int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) {
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) {
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) {
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+static int TestSobel(int width, int height, int benchmark_iterations,
+                     int disable_cpu_flags, int benchmark_cpu_info,
+                     int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  memset(src_argb_a, 0, kStride * height + off);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSobel(src_argb_a + off, kStride,
+            dst_argb_c, kStride,
+            width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBSobel(src_argb_a + off, kStride,
+              dst_argb_opt, kStride,
+              width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
+  int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) {
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) {
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) {
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+static int TestSobelToPlane(int width, int height, int benchmark_iterations,
+                            int disable_cpu_flags, int benchmark_cpu_info,
+                            int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kSrcBpp = 4;
+  const int kDstBpp = 1;
+  const int kSrcStride = (width * kSrcBpp + 15) & ~15;
+  const int kDstStride = (width * kDstBpp + 15) & ~15;
+  align_buffer_page_end(src_argb_a, kSrcStride * height + off);
+  align_buffer_page_end(dst_argb_c, kDstStride * height);
+  align_buffer_page_end(dst_argb_opt, kDstStride * height);
+  memset(src_argb_a, 0, kSrcStride * height + off);
+  for (int i = 0; i < kSrcStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kDstStride * height);
+  memset(dst_argb_opt, 0, kDstStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSobelToPlane(src_argb_a + off, kSrcStride,
+                   dst_argb_c, kDstStride,
+                   width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBSobelToPlane(src_argb_a + off, kSrcStride,
+                     dst_argb_opt, kDstStride,
+                     width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kDstStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
+  int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) {
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) {
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  -1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) {
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+static int TestSobelXY(int width, int height, int benchmark_iterations,
+                       int disable_cpu_flags, int benchmark_cpu_info,
+                       int invert, int off) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  memset(src_argb_a, 0, kStride * height + off);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSobelXY(src_argb_a + off, kStride,
+            dst_argb_c, kStride,
+            width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBSobelXY(src_argb_a + off, kStride,
+              dst_argb_opt, kStride,
+              width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
+  int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) {
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) {
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) {
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);
+}
+
+static int TestBlur(int width, int height, int benchmark_iterations,
+                    int disable_cpu_flags, int benchmark_cpu_info,
+                    int invert, int off, int radius) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_page_end(src_argb_a, kStride * height + off);
+  align_buffer_page_end(dst_cumsum, width * height * 16);
+  align_buffer_page_end(dst_argb_c, kStride * height);
+  align_buffer_page_end(dst_argb_opt, kStride * height);
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_cumsum, 0, width * height * 16);
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBBlur(src_argb_a + off, kStride,
+           dst_argb_c, kStride,
+           reinterpret_cast<int32*>(dst_cumsum), width * 4,
+           width, invert * height, radius);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBBlur(src_argb_a + off, kStride,
+             dst_argb_opt, kStride,
+             reinterpret_cast<int32*>(dst_cumsum), width * 4,
+             width, invert * height, radius);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
     }
   }
-  ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
-  uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
-  EXPECT_EQ(610919429u, checksum);
+  free_aligned_buffer_page_end(src_argb_a);
+  free_aligned_buffer_page_end(dst_cumsum);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+static const int kBlurSize = 55;
+TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {
+  int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 1, kBlurSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          -1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+static const int kBlurSmallSize = 5;
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {
+  int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 1, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          -1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
+    0.94230f,  -3.03300f,    -2.92500f,  0.f,  // C0
+    0.584500f,  1.112000f,    1.535000f, 1.f,  // C1 x
+    0.001313f, -0.002503f,   -0.004496f, 0.f,  // C2 x * x
+    0.0f,       0.000006965f, 0.000008781f, 0.f,  // C3 x * x * x
+  };
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test white
+  orig_pixels[3][0] = 255u;
+  orig_pixels[3][1] = 255u;
+  orig_pixels[3][2] = 255u;
+  orig_pixels[3][3] = 255u;
+  // Test color
+  orig_pixels[4][0] = 16u;
+  orig_pixels[4][1] = 64u;
+  orig_pixels[4][2] = 192u;
+  orig_pixels[4][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                 &kWarmifyPolynomial[0], 16, 1);
+  EXPECT_EQ(235u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(235u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[3][3]);
+  EXPECT_EQ(10u, dst_pixels_opt[4][0]);
+  EXPECT_EQ(59u, dst_pixels_opt[4][1]);
+  EXPECT_EQ(188u, dst_pixels_opt[4][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[4][3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                 &kWarmifyPolynomial[0], 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                   &kWarmifyPolynomial[0], 1280, 1);
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  align_buffer_page_end(lumacolortable, 32768);
+  int v = 0;
+  for (int i = 0; i < 32768; ++i) {
+    lumacolortable[i] = v;
+    v += 3;
+  }
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test color
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                     &lumacolortable[0], 16, 1);
+  EXPECT_EQ(253u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(253u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(253u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(48u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(192u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(64u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                     lumacolortable, 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                       lumacolortable, 1280, 1);
+  }
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+  }
+
+  free_aligned_buffer_page_end(lumacolortable);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
+  const int kSize = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_pixels_opt, kSize);
+  align_buffer_page_end(dst_pixels_c, kSize);
+
+  MemRandomize(orig_pixels, kSize);
+  MemRandomize(dst_pixels_opt, kSize);
+  memcpy(dst_pixels_c, dst_pixels_opt, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
+                dst_pixels_c, benchmark_width_ * 4,
+                benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
+                  dst_pixels_opt, benchmark_width_ * 4,
+                  benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < kSize; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_pixels, kPixels * 4);
+  align_buffer_page_end(dst_pixels_opt, kPixels);
+  align_buffer_page_end(dst_pixels_c, kPixels);
+
+  MemRandomize(src_pixels, kPixels * 4);
+  MemRandomize(dst_pixels_opt, kPixels);
+  memcpy(dst_pixels_c, dst_pixels_opt, kPixels);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBExtractAlpha(src_pixels, benchmark_width_ * 4,
+                   dst_pixels_c, benchmark_width_,
+                   benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBExtractAlpha(src_pixels, benchmark_width_ * 4,
+                     dst_pixels_opt, benchmark_width_,
+                     benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < kPixels; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(src_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(orig_pixels, kPixels);
+  align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+  align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+  MemRandomize(orig_pixels, kPixels);
+  MemRandomize(dst_pixels_opt, kPixels * 4);
+  memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
+                   dst_pixels_c, benchmark_width_ * 4,
+                   benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
+                     dst_pixels_opt, benchmark_width_ * 4,
+                     benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < kPixels * 4; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+static int TestARGBRect(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off, int bpp) {
+  if (width < 1) {
+    width = 1;
+  }
+  const int kStride = width * bpp;
+  const int kSize = kStride * height;
+  const uint32 v32 = fastrand() & (bpp == 4 ? 0xffffffff : 0xff);
+
+  align_buffer_page_end(dst_argb_c, kSize + off);
+  align_buffer_page_end(dst_argb_opt, kSize + off);
+
+  MemRandomize(dst_argb_c + off, kSize);
+  memcpy(dst_argb_opt + off, dst_argb_c + off, kSize);
+
+  MaskCpuFlags(disable_cpu_flags);
+  if (bpp == 4) {
+    ARGBRect(dst_argb_c + off, kStride, 0, 0, width, invert * height, v32);
+  } else {
+    SetPlane(dst_argb_c + off, kStride, width, invert * height, v32);
+  }
+
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    if (bpp == 4) {
+      ARGBRect(dst_argb_opt + off, kStride, 0, 0, width, invert * height, v32);
+    } else {
+      SetPlane(dst_argb_opt + off, kStride, width, invert * height, v32);
+    }
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i + off]) -
+            static_cast<int>(dst_argb_opt[i + off]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
+  int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 1, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Invert) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              -1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Any) {
+  int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 0, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 1, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Invert) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              -1, 0, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
+  int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_,
+                              +1, 0, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 }  // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc
index fe8435e1..9c83c356 100644
--- a/files/unit_test/rotate_argb_test.cc
+++ b/files/unit_test/rotate_argb_test.cc
@@ -4,12 +4,11 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include <stdlib.h>
-#include <time.h>
 
 #include "libyuv/cpu_id.h"
 #include "libyuv/rotate_argb.h"
@@ -17,179 +16,181 @@
 
 namespace libyuv {
 
-static int ARGBTestRotate(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          libyuv::RotationMode mode, int runs) {
-  const int b = 128;
-  int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
-  int src_stride_argb = (b * 2 + src_width) * 4;
-
-  align_buffer_16(src_argb, src_argb_plane_size)
-  memset(src_argb, 1, src_argb_plane_size);
-
-  int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
-  int dst_stride_argb = (b * 2 + dst_width) * 4;
-
-  srandom(time(NULL));
-
-  int i, j;
-  for (i = b; i < (src_height + b); ++i) {
-    for (j = b; j < (src_width + b) * 4; ++j) {
-      src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
-    }
+void TestRotateBpp(int src_width, int src_height,
+                   int dst_width, int dst_height,
+                   libyuv::RotationMode mode,
+                   int benchmark_iterations,
+                   int disable_cpu_flags,
+                   int benchmark_cpu_info,
+                   const int kBpp) {
+  if (src_width < 1) {
+    src_width = 1;
+  }
+  if (src_height < 1) {
+    src_height = 1;
+  }
+  if (dst_width < 1) {
+    dst_width = 1;
+  }
+  if (dst_height < 1) {
+    dst_height = 1;
+  }
+  int src_stride_argb = src_width * kBpp;
+  int src_argb_plane_size = src_stride_argb * abs(src_height);
+  align_buffer_page_end(src_argb, src_argb_plane_size);
+  for (int i = 0; i < src_argb_plane_size; ++i) {
+    src_argb[i] = fastrand() & 0xff;
   }
 
-  align_buffer_16(dst_argb_c, dst_argb_plane_size)
-  align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+  int dst_stride_argb = dst_width * kBpp;
+  int dst_argb_plane_size = dst_stride_argb * dst_height;
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
   memset(dst_argb_c, 2, dst_argb_plane_size);
   memset(dst_argb_opt, 3, dst_argb_plane_size);
 
-  // Warm up both versions for consistent benchmarks.
-  MaskCpuFlags(0);  // Disable all CPU optimization.
-  ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
-             dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
-             src_width, src_height, mode);
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
-  ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
-             dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
-             src_width, src_height, mode);
-
-  MaskCpuFlags(0);  // Disable all CPU optimization.
-  double c_time = get_time();
-  for (i = 0; i < runs; ++i) {
-    ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
-               dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+  if (kBpp == 1) {
+    MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+    RotatePlane(src_argb, src_stride_argb,
+                dst_argb_c, dst_stride_argb,
+                src_width, src_height, mode);
+
+    MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+    for (int i = 0; i < benchmark_iterations; ++i) {
+      RotatePlane(src_argb, src_stride_argb,
+                  dst_argb_opt, dst_stride_argb,
+                  src_width, src_height, mode);
+    }
+  } else if (kBpp == 4) {
+    MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+    ARGBRotate(src_argb, src_stride_argb,
+               dst_argb_c, dst_stride_argb,
                src_width, src_height, mode);
-  }
-  c_time = (get_time() - c_time) / runs;
 
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
-  double opt_time = get_time();
-  for (i = 0; i < runs; ++i) {
-    ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
-               dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
-               src_width, src_height, mode);
-  }
-  opt_time = (get_time() - opt_time) / runs;
-
-  // Report performance of C vs OPT
-  printf("filter %d - %8d us C - %8d us OPT\n",
-         mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
-
-  // C version may be a little off from the optimized. Order of
-  //  operations may introduce rounding somewhere. So do a difference
-  //  of the buffers and look to see that the max difference isn't
-  //  over 2.
-  int max_diff = 0;
-  for (i = b; i < (dst_height + b); ++i) {
-    for (j = b * 4; j < (dst_width + b) * 4; ++j) {
-      int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
-                         dst_argb_opt[(i * dst_stride_argb) + j]);
-      if (abs_diff > max_diff)
-        max_diff = abs_diff;
+    MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+    for (int i = 0; i < benchmark_iterations; ++i) {
+      ARGBRotate(src_argb, src_stride_argb,
+                 dst_argb_opt, dst_stride_argb,
+                 src_width, src_height, mode);
     }
   }
 
-  free_aligned_buffer_16(dst_argb_c)
-  free_aligned_buffer_16(dst_argb_opt)
-  free_aligned_buffer_16(src_argb)
-  return max_diff;
-}
-
-TEST_F(libyuvTest, ARGBRotate0) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = 1280;
-  const int dst_height = 720;
+  // Rotation should be exact.
+  for (int i = 0; i < dst_argb_plane_size; ++i) {
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+  }
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate0,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_argb);
 }
 
-TEST_F(libyuvTest, ARGBRotate90) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = 720;
-  const int dst_height = 1280;
+static void ARGBTestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags,
+                           int benchmark_cpu_info) {
+  TestRotateBpp(src_width, src_height,
+                dst_width, dst_height,
+                mode, benchmark_iterations,
+                disable_cpu_flags, benchmark_cpu_info, 4);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate90,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
+  ARGBTestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, ARGBRotate180) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = 1280;
-  const int dst_height = 720;
+TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
+  ARGBTestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate180,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
+  ARGBTestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, ARGBRotate270) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = 720;
-  const int dst_height = 1280;
+TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
+  ARGBTestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate270,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+static void TestRotatePlane(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            libyuv::RotationMode mode,
+                            int benchmark_iterations,
+                            int disable_cpu_flags,
+                            int benchmark_cpu_info) {
+  TestRotateBpp(src_width, src_height,
+                dst_width, dst_height,
+                mode, benchmark_iterations,
+                disable_cpu_flags, benchmark_cpu_info, 1);
 }
 
-TEST_F(libyuvTest, ARGBRotate0_Odd) {
-  const int src_width = 1277;
-  const int src_height = 719;
-  const int dst_width = 1277;
-  const int dst_height = 719;
+TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
+  TestRotatePlane(benchmark_width_, benchmark_height_,
+                  benchmark_width_, benchmark_height_,
+                  kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate0,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
+  TestRotatePlane(benchmark_width_, benchmark_height_,
+                  benchmark_height_, benchmark_width_,
+                  kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, ARGBRotate90_Odd) {
-  const int src_width = 1277;
-  const int src_height = 719;
-  const int dst_width = 719;
-  const int dst_height = 1277;
+TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
+  TestRotatePlane(benchmark_width_, benchmark_height_,
+                  benchmark_width_, benchmark_height_,
+                  kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate90,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
+  TestRotatePlane(benchmark_width_, benchmark_height_,
+                  benchmark_height_, benchmark_width_,
+                  kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, ARGBRotate180_Odd) {
-  const int src_width = 1277;
-  const int src_height = 719;
-  const int dst_width = 1277;
-  const int dst_height = 719;
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+                  benchmark_width_ - 3, benchmark_height_ - 1,
+                  kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate180,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+                  benchmark_height_ - 1, benchmark_width_ - 3,
+                  kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, ARGBRotate270_Odd) {
-  const int src_width = 1277;
-  const int src_height = 719;
-  const int dst_width = 719;
-  const int dst_height = 1277;
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+                  benchmark_width_ - 3, benchmark_height_ - 1,
+                  kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  int err = ARGBTestRotate(src_width, src_height,
-                           dst_width, dst_height, kRotate270,
-                           benchmark_iterations_);
-  EXPECT_GE(1, err);
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+                  benchmark_height_ - 1, benchmark_width_ - 3,
+                  kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
 }  // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
index 788e511e..07e2f73a 100644
--- a/files/unit_test/rotate_test.cc
+++ b/files/unit_test/rotate_test.cc
@@ -1,1549 +1,296 @@
 /*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include <stdlib.h>
-#include <time.h>
 
+#include "libyuv/cpu_id.h"
 #include "libyuv/rotate.h"
 #include "../unit_test/unit_test.h"
 
 namespace libyuv {
 
-void PrintArray(uint8 *array, int w, int h) {
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      printf("%4d", (signed char)array[i * w + j]);
-    }
-    printf("\n");
-  }
+static void I420TestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info) {
+  if (src_width < 1) {
+    src_width = 1;
+  }
+  if (src_height == 0) {
+    src_height = 1;
+  }
+  if (dst_width < 1) {
+    dst_width = 1;
+  }
+  if (dst_height < 1) {
+    dst_height = 1;
+  }
+  int src_i420_y_size = src_width * Abs(src_height);
+  int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+  int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+  align_buffer_page_end(src_i420, src_i420_size);
+  for (int i = 0; i < src_i420_size; ++i) {
+    src_i420[i] = fastrand() & 0xff;
+  }
+
+  int dst_i420_y_size = dst_width * dst_height;
+  int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+  int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+  align_buffer_page_end(dst_i420_c, dst_i420_size);
+  align_buffer_page_end(dst_i420_opt, dst_i420_size);
+  memset(dst_i420_c, 2, dst_i420_size);
+  memset(dst_i420_opt, 3, dst_i420_size);
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  I420Rotate(src_i420, src_width,
+             src_i420 + src_i420_y_size, (src_width + 1) / 2,
+             src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+             dst_i420_c, dst_width,
+             dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+             dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+               (dst_width + 1) / 2,
+             src_width, src_height, mode);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    I420Rotate(src_i420, src_width,
+               src_i420 + src_i420_y_size, (src_width + 1) / 2,
+               src_i420 + src_i420_y_size + src_i420_uv_size,
+                 (src_width + 1) / 2,
+               dst_i420_opt, dst_width,
+               dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+               dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+                 (dst_width + 1) / 2,
+               src_width, src_height, mode);
+  }
+
+  // Rotation should be exact.
+  for (int i = 0; i < dst_i420_size; ++i) {
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(dst_i420_c);
+  free_aligned_buffer_page_end(dst_i420_opt);
+  free_aligned_buffer_page_end(src_i420);
 }
 
-TEST_F(libyuvTest, Transpose) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-      ow = ih;
-      oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_1, ow * oh)
-      align_buffer_16(output_2, iw * ih)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      TransposePlane(input,    iw, output_1, ow, iw, ih);
-      TransposePlane(output_1, ow, output_2, oh, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_2[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("transpose 1\n");
-        PrintArray(output_1, ow, oh);
-
-        printf("transpose 2\n");
-        PrintArray(output_2, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_1)
-      free_aligned_buffer_16(output_2)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
+  I420TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, TransposeUV) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = ih;
-      oh = iw >> 1;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_a1, ow * oh)
-      align_buffer_16(output_b1, ow * oh)
-      align_buffer_16(output_a2, iw * ih)
-      align_buffer_16(output_b2, iw * ih)
-
-      for (i = 0; i < iw * ih; i += 2) {
-        input[i] = i >> 1;
-        input[i + 1] = -(i >> 1);
-      }
-
-      TransposeUV(input, iw, output_a1, ow, output_b1, ow, iw >> 1, ih);
-
-      TransposePlane(output_a1, ow, output_a2, oh, ow, oh);
-      TransposePlane(output_b1, ow, output_b2, oh, ow, oh);
-
-      for (i = 0; i < iw * ih; i += 2) {
-        if (input[i] != output_a2[i >> 1]) {
-          err++;
-        }
-        if (input[i + 1] != output_b2[i >> 1]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("transpose 1\n");
-        PrintArray(output_a1, ow, oh);
-        PrintArray(output_b1, ow, oh);
-
-        printf("transpose 2\n");
-        PrintArray(output_a2, oh, ow);
-        PrintArray(output_b2, oh, ow);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_a1)
-      free_aligned_buffer_16(output_b1)
-      free_aligned_buffer_16(output_a2)
-      free_aligned_buffer_16(output_b2)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
+  I420TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotatePlane90) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = ih;
-      oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_90, ow * oh)
-      align_buffer_16(output_180, iw * ih)
-      align_buffer_16(output_270, ow * oh)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      RotatePlane90(input,      iw, output_90,  ow, iw, ih);
-      RotatePlane90(output_90,  ow, output_180, oh, ow, oh);
-      RotatePlane90(output_180, oh, output_270, ow, oh, ow);
-      RotatePlane90(output_270, ow, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 90\n");
-        PrintArray(output_90, ow, oh);
-
-        printf("output 180\n");
-        PrintArray(output_180, iw, ih);
-
-        printf("output 270\n");
-        PrintArray(output_270, ow, oh);
-
-        printf("output 0\n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_90)
-      free_aligned_buffer_16(output_180)
-      free_aligned_buffer_16(output_270)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
+  I420TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotateUV90) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = ih;
-      oh = iw >> 1;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0_u, ow * oh)
-      align_buffer_16(output_0_v, ow * oh)
-      align_buffer_16(output_90_u, ow * oh)
-      align_buffer_16(output_90_v, ow * oh)
-      align_buffer_16(output_180_u, ow * oh)
-      align_buffer_16(output_180_v, ow * oh)
-
-      for (i = 0; i < iw * ih; i += 2) {
-        input[i] = i >> 1;
-        input[i + 1] = -(i >> 1);
-      }
-
-      RotateUV90(input, iw, output_90_u, ow, output_90_v, ow, iw >> 1, ih);
-
-      RotatePlane90(output_90_u, ow, output_180_u, oh, ow, oh);
-      RotatePlane90(output_90_v, ow, output_180_v, oh, ow, oh);
-
-      RotatePlane180(output_180_u, ow, output_0_u, ow, ow, oh);
-      RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
-
-      for (i = 0; i < (ow * oh); ++i) {
-        if (output_0_u[i] != (uint8)i) {
-          err++;
-        }
-        if (output_0_v[i] != (uint8)(-i)) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 90_u\n");
-        PrintArray(output_90_u, ow, oh);
-
-        printf("output 90_v\n");
-        PrintArray(output_90_v, ow, oh);
-
-        printf("output 180_u\n");
-        PrintArray(output_180_u, oh, ow);
-
-        printf("output 180_v\n");
-        PrintArray(output_180_v, oh, ow);
-
-        printf("output 0_u\n");
-        PrintArray(output_0_u, oh, ow);
-
-        printf("output 0_v\n");
-        PrintArray(output_0_v, oh, ow);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0_u)
-      free_aligned_buffer_16(output_0_v)
-      free_aligned_buffer_16(output_90_u)
-      free_aligned_buffer_16(output_90_v)
-      free_aligned_buffer_16(output_180_u)
-      free_aligned_buffer_16(output_180_v)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
+  I420TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotateUV180) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = iw >> 1;
-      oh = ih;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0_u, ow * oh)
-      align_buffer_16(output_0_v, ow * oh)
-      align_buffer_16(output_90_u, ow * oh)
-      align_buffer_16(output_90_v, ow * oh)
-      align_buffer_16(output_180_u, ow * oh)
-      align_buffer_16(output_180_v, ow * oh)
-
-      for (i = 0; i < iw * ih; i += 2) {
-        input[i] = i >> 1;
-        input[i + 1] = -(i >> 1);
-      }
-
-      RotateUV180(input, iw, output_180_u, ow, output_180_v, ow, iw >> 1, ih);
-
-      RotatePlane90(output_180_u, ow, output_90_u, oh, ow, oh);
-      RotatePlane90(output_180_v, ow, output_90_v, oh, ow, oh);
-
-      RotatePlane90(output_90_u, oh, output_0_u, ow, oh, ow);
-      RotatePlane90(output_90_v, oh, output_0_v, ow, oh, ow);
-
-      for (i = 0; i < (ow * oh); ++i) {
-        if (output_0_u[i] != (uint8)i) {
-          err++;
-        }
-        if (output_0_v[i] != (uint8)(-i)) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 180_u\n");
-        PrintArray(output_180_u, oh, ow);
-
-        printf("output 180_v\n");
-        PrintArray(output_180_v, oh, ow);
-
-        printf("output 90_u\n");
-        PrintArray(output_90_u, oh, ow);
-
-        printf("output 90_v\n");
-        PrintArray(output_90_v, oh, ow);
-
-        printf("output 0_u\n");
-        PrintArray(output_0_u, ow, oh);
-
-        printf("output 0_v\n");
-        PrintArray(output_0_v, ow, oh);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0_u)
-      free_aligned_buffer_16(output_0_v)
-      free_aligned_buffer_16(output_90_u)
-      free_aligned_buffer_16(output_90_v)
-      free_aligned_buffer_16(output_180_u)
-      free_aligned_buffer_16(output_180_v)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but disabled because they use C code and can be
+// tested by passing an odd width command line or environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_width_ - 3, benchmark_height_ - 1,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotateUV270) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = ih;
-      oh = iw >> 1;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0_u, ow * oh)
-      align_buffer_16(output_0_v, ow * oh)
-      align_buffer_16(output_270_u, ow * oh)
-      align_buffer_16(output_270_v, ow * oh)
-      align_buffer_16(output_180_u, ow * oh)
-      align_buffer_16(output_180_v, ow * oh)
-
-      for (i = 0; i < iw * ih; i += 2) {
-        input[i] = i >> 1;
-        input[i + 1] = -(i >> 1);
-      }
-
-      RotateUV270(input, iw, output_270_u, ow, output_270_v, ow,
-                       iw >> 1, ih);
-
-      RotatePlane270(output_270_u, ow, output_180_u, oh, ow, oh);
-      RotatePlane270(output_270_v, ow, output_180_v, oh, ow, oh);
-
-      RotatePlane180(output_180_u, ow, output_0_u, ow, ow, oh);
-      RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
-
-      for (i = 0; i < (ow * oh); ++i) {
-        if (output_0_u[i] != (uint8)i) {
-          err++;
-        }
-        if (output_0_v[i] != (uint8)(-i)) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 270_u\n");
-        PrintArray(output_270_u, ow, oh);
-
-        printf("output 270_v\n");
-        PrintArray(output_270_v, ow, oh);
-
-        printf("output 180_u\n");
-        PrintArray(output_180_u, oh, ow);
-
-        printf("output 180_v\n");
-        PrintArray(output_180_v, oh, ow);
-
-        printf("output 0_u\n");
-        PrintArray(output_0_u, oh, ow);
-
-        printf("output 0_v\n");
-        PrintArray(output_0_v, oh, ow);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0_u)
-      free_aligned_buffer_16(output_0_v)
-      free_aligned_buffer_16(output_270_u)
-      free_aligned_buffer_16(output_270_v)
-      free_aligned_buffer_16(output_180_u)
-      free_aligned_buffer_16(output_180_v)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_height_ - 1, benchmark_width_ - 3,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotatePlane180) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = iw;
-      oh = ih;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_180, iw * ih)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      RotatePlane180(input,      iw, output_180, ow, iw, ih);
-      RotatePlane180(output_180, ow, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 180\n");
-        PrintArray(output_180, iw, ih);
-
-        printf("output 0\n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_180)
-    }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_width_ - 3, benchmark_height_ - 1,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotatePlane270) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
-    for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
-      int i;
-
-      ow = ih;
-      oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_90, ow * oh)
-      align_buffer_16(output_180, iw * ih)
-      align_buffer_16(output_270, ow * oh)
-
-      for (i = 0; i < iw * ih; ++i)
-        input[i] = i;
-
-      RotatePlane270(input,      iw, output_270, ow, iw, ih);
-      RotatePlane270(output_270, ow, output_180, oh, ow, oh);
-      RotatePlane270(output_180, oh, output_90,  ow, oh, ow);
-      RotatePlane270(output_90,  ow, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("input %dx%d \n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output 270\n");
-        PrintArray(output_270, ow, oh);
-
-        printf("output 180\n");
-        PrintArray(output_180, iw, ih);
-
-        printf("output 90\n");
-        PrintArray(output_90, ow, oh);
-
-        printf("output 0\n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_90)
-      free_aligned_buffer_16(output_180)
-      free_aligned_buffer_16(output_270)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_height_ - 1, benchmark_width_ - 3,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotatePlane90and270) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
-    for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
-      int i;
-
-      ow = ih;
-      oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_90, ow * oh)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      RotatePlane90(input,      iw, output_90,  ow, iw, ih);
-      RotatePlane270(output_90, ow, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("intput %dx%d\n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output \n");
-        PrintArray(output_90, ow, oh);
-
-        printf("output \n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_90)
-    }
-
-  EXPECT_EQ(0, err);
+static void NV12TestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info) {
+  if (src_width < 1) {
+    src_width = 1;
+  }
+  if (src_height == 0) {  // allow negative for inversion test.
+    src_height = 1;
+  }
+  if (dst_width < 1) {
+    dst_width = 1;
+  }
+  if (dst_height < 1) {
+    dst_height = 1;
+  }
+  int src_nv12_y_size = src_width * Abs(src_height);
+  int src_nv12_uv_size =
+      ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
+  int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+  align_buffer_page_end(src_nv12, src_nv12_size);
+  for (int i = 0; i < src_nv12_size; ++i) {
+    src_nv12[i] = fastrand() & 0xff;
+  }
+
+  int dst_i420_y_size = dst_width * dst_height;
+  int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+  int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+  align_buffer_page_end(dst_i420_c, dst_i420_size);
+  align_buffer_page_end(dst_i420_opt, dst_i420_size);
+  memset(dst_i420_c, 2, dst_i420_size);
+  memset(dst_i420_opt, 3, dst_i420_size);
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  NV12ToI420Rotate(src_nv12, src_width,
+                   src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
+                   dst_i420_c, dst_width,
+                   dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+                   dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+                     (dst_width + 1) / 2,
+                   src_width, src_height, mode);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    NV12ToI420Rotate(src_nv12, src_width,
+                     src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
+                     dst_i420_opt, dst_width,
+                     dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+                     dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+                       (dst_width + 1) / 2,
+                     src_width, src_height, mode);
+  }
+
+  // Rotation should be exact.
+  for (int i = 0; i < dst_i420_size; ++i) {
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(dst_i420_c);
+  free_aligned_buffer_page_end(dst_i420_opt);
+  free_aligned_buffer_page_end(src_nv12);
 }
 
-TEST_F(libyuvTest, RotatePlane90Pitch) {
-  int iw, ih;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
-    for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
-      int i;
-
-      int ow = ih;
-      int oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_90, ow * oh)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      RotatePlane90(input, iw,
-                    output_90 + (ow >> 1), ow,
-                    iw >> 1, ih >> 1);
-      RotatePlane90(input + (iw >> 1), iw,
-                    output_90 + (ow >> 1) + ow * (oh >> 1), ow,
-                    iw >> 1, ih >> 1);
-      RotatePlane90(input + iw * (ih >> 1), iw,
-                    output_90, ow,
-                    iw >> 1, ih >> 1);
-      RotatePlane90(input + (iw >> 1) + iw * (ih >> 1), iw,
-                    output_90 + ow * (oh >> 1), ow,
-                    iw >> 1, ih >> 1);
-
-      RotatePlane270(output_90, ih, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("intput %dx%d\n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output \n");
-        PrintArray(output_90, ow, oh);
-
-        printf("output \n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_90)
-    }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
+  NV12TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, RotatePlane270Pitch) {
-  int iw, ih, ow, oh;
-  int err = 0;
-
-  for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) {
-    for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
-      int i;
-
-      ow = ih;
-      oh = iw;
-
-      align_buffer_16(input, iw * ih)
-      align_buffer_16(output_0, iw * ih)
-      align_buffer_16(output_270, ow * oh)
-
-      for (i = 0; i < iw * ih; ++i) {
-        input[i] = i;
-      }
-
-      RotatePlane270(input, iw,
-                     output_270 + ow * (oh >> 1), ow,
-                     iw >> 1, ih >> 1);
-      RotatePlane270(input + (iw >> 1), iw,
-                     output_270, ow,
-                     iw >> 1, ih >> 1);
-      RotatePlane270(input + iw * (ih >> 1), iw,
-                     output_270 + (ow >> 1) + ow * (oh >> 1), ow,
-                     iw >> 1, ih >> 1);
-      RotatePlane270(input + (iw >> 1) + iw * (ih >> 1), iw,
-                     output_270 + (ow >> 1), ow,
-                     iw >> 1, ih >> 1);
-
-      RotatePlane90(output_270, ih, output_0,   iw, ow, oh);
-
-      for (i = 0; i < iw * ih; ++i) {
-        if (input[i] != output_0[i]) {
-          err++;
-        }
-      }
-
-      if (err) {
-        printf("intput %dx%d\n", iw, ih);
-        PrintArray(input, iw, ih);
-
-        printf("output \n");
-        PrintArray(output_270, ow, oh);
-
-        printf("output \n");
-        PrintArray(output_0, iw, ih);
-      }
-
-      free_aligned_buffer_16(input)
-      free_aligned_buffer_16(output_0)
-      free_aligned_buffer_16(output_270)
-    }
-  }
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
+  NV12TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, I420Rotate90) {
-  int err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_u, uv_plane_size)
-  align_buffer_16(orig_v, uv_plane_size)
-  align_buffer_16(ro0_y, y_plane_size)
-  align_buffer_16(ro0_u, uv_plane_size)
-  align_buffer_16(ro0_v, uv_plane_size)
-  align_buffer_16(ro90_y, y_plane_size)
-  align_buffer_16(ro90_u, uv_plane_size)
-  align_buffer_16(ro90_v, uv_plane_size)
-  align_buffer_16(ro270_y, y_plane_size)
-  align_buffer_16(ro270_u, uv_plane_size)
-  align_buffer_16(ro270_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_u, 0, uv_plane_size);
-  memset(orig_v, 0, uv_plane_size);
-  memset(ro0_y, 0, y_plane_size);
-  memset(ro0_u, 0, uv_plane_size);
-  memset(ro0_v, 0, uv_plane_size);
-  memset(ro90_y, 0, y_plane_size);
-  memset(ro90_u, 0, uv_plane_size);
-  memset(ro90_v, 0, uv_plane_size);
-  memset(ro270_y, 0, y_plane_size);
-  memset(ro270_u, 0, uv_plane_size);
-  memset(ro270_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw + b); ++j) {
-      orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
-      orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  int y_off_0 = b * (yw + b * 2) + b;
-  int uv_off_0 = b * (uvw + b * 2) + b;
-  int y_off_90 = b * (yh + b * 2) + b;
-  int uv_off_90 = b * (uvh + b * 2) + b;
-
-  int y_st_0 = yw + b * 2;
-  int uv_st_0 = uvw + b * 2;
-  int y_st_90 = yh + b * 2;
-  int uv_st_90 = uvh + b * 2;
-
-  I420Rotate(orig_y+y_off_0, y_st_0,
-             orig_u+uv_off_0, uv_st_0,
-             orig_v+uv_off_0, uv_st_0,
-             ro90_y+y_off_90, y_st_90,
-             ro90_u+uv_off_90, uv_st_90,
-             ro90_v+uv_off_90, uv_st_90,
-             yw, yh,
-             kRotateClockwise);
-
-  I420Rotate(ro90_y+y_off_90, y_st_90,
-             ro90_u+uv_off_90, uv_st_90,
-             ro90_v+uv_off_90, uv_st_90,
-             ro270_y+y_off_90, y_st_90,
-             ro270_u+uv_off_90, uv_st_90,
-             ro270_v+uv_off_90, uv_st_90,
-             yh, yw,
-             kRotate180);
-
-  I420Rotate(ro270_y+y_off_90, y_st_90,
-             ro270_u+uv_off_90, uv_st_90,
-             ro270_v+uv_off_90, uv_st_90,
-             ro0_y+y_off_0, y_st_0,
-             ro0_u+uv_off_0, uv_st_0,
-             ro0_v+uv_off_0, uv_st_0,
-             yh, yw,
-             kRotateClockwise);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != ro0_y[i]) {
-      ++err;
-    }
-  }
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if (orig_u[i] != ro0_u[i]) {
-      ++err;
-    }
-    if (orig_v[i] != ro0_v[i]) {
-      ++err;
-    }
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_u)
-  free_aligned_buffer_16(orig_v)
-  free_aligned_buffer_16(ro0_y)
-  free_aligned_buffer_16(ro0_u)
-  free_aligned_buffer_16(ro0_v)
-  free_aligned_buffer_16(ro90_y)
-  free_aligned_buffer_16(ro90_u)
-  free_aligned_buffer_16(ro90_v)
-  free_aligned_buffer_16(ro270_y)
-  free_aligned_buffer_16(ro270_u)
-  free_aligned_buffer_16(ro270_v)
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
+  NV12TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, I420Rotate270) {
-  int err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_u, uv_plane_size)
-  align_buffer_16(orig_v, uv_plane_size)
-  align_buffer_16(ro0_y, y_plane_size)
-  align_buffer_16(ro0_u, uv_plane_size)
-  align_buffer_16(ro0_v, uv_plane_size)
-  align_buffer_16(ro90_y, y_plane_size)
-  align_buffer_16(ro90_u, uv_plane_size)
-  align_buffer_16(ro90_v, uv_plane_size)
-  align_buffer_16(ro270_y, y_plane_size)
-  align_buffer_16(ro270_u, uv_plane_size)
-  align_buffer_16(ro270_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_u, 0, uv_plane_size);
-  memset(orig_v, 0, uv_plane_size);
-  memset(ro0_y, 0, y_plane_size);
-  memset(ro0_u, 0, uv_plane_size);
-  memset(ro0_v, 0, uv_plane_size);
-  memset(ro90_y, 0, y_plane_size);
-  memset(ro90_u, 0, uv_plane_size);
-  memset(ro90_v, 0, uv_plane_size);
-  memset(ro270_y, 0, y_plane_size);
-  memset(ro270_u, 0, uv_plane_size);
-  memset(ro270_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw + b); ++j) {
-      orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
-      orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  int y_off_0 = b * (yw + b * 2) + b;
-  int uv_off_0 = b * (uvw + b * 2) + b;
-  int y_off_90 = b * (yh + b * 2) + b;
-  int uv_off_90 = b * (uvh + b * 2) + b;
-
-  int y_st_0 = yw + b * 2;
-  int uv_st_0 = uvw + b * 2;
-  int y_st_90 = yh + b * 2;
-  int uv_st_90 = uvh + b * 2;
-
-  I420Rotate(orig_y+y_off_0, y_st_0,
-             orig_u+uv_off_0, uv_st_0,
-             orig_v+uv_off_0, uv_st_0,
-             ro270_y+y_off_90, y_st_90,
-             ro270_u+uv_off_90, uv_st_90,
-             ro270_v+uv_off_90, uv_st_90,
-             yw, yh,
-             kRotateCounterClockwise);
-
-  I420Rotate(ro270_y+y_off_90, y_st_90,
-             ro270_u+uv_off_90, uv_st_90,
-             ro270_v+uv_off_90, uv_st_90,
-             ro90_y+y_off_90, y_st_90,
-             ro90_u+uv_off_90, uv_st_90,
-             ro90_v+uv_off_90, uv_st_90,
-             yh, yw,
-             kRotate180);
-
-  I420Rotate(ro90_y+y_off_90, y_st_90,
-             ro90_u+uv_off_90, uv_st_90,
-             ro90_v+uv_off_90, uv_st_90,
-             ro0_y+y_off_0, y_st_0,
-             ro0_u+uv_off_0, uv_st_0,
-             ro0_v+uv_off_0, uv_st_0,
-             yh, yw,
-             kRotateCounterClockwise);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != ro0_y[i]) {
-      ++err;
-    }
-  }
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if (orig_u[i] != ro0_u[i]) {
-      ++err;
-    }
-    if (orig_v[i] != ro0_v[i]) {
-      ++err;
-    }
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_u)
-  free_aligned_buffer_16(orig_v)
-  free_aligned_buffer_16(ro0_y)
-  free_aligned_buffer_16(ro0_u)
-  free_aligned_buffer_16(ro0_v)
-  free_aligned_buffer_16(ro90_y)
-  free_aligned_buffer_16(ro90_u)
-  free_aligned_buffer_16(ro90_v)
-  free_aligned_buffer_16(ro270_y)
-  free_aligned_buffer_16(ro270_u)
-  free_aligned_buffer_16(ro270_v)
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
+  NV12TestRotate(benchmark_width_, benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, NV12ToI420Rotate90) {
-  int err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-  int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_uv, o_uv_plane_size)
-  align_buffer_16(ro0_y, y_plane_size)
-  align_buffer_16(ro0_u, uv_plane_size)
-  align_buffer_16(ro0_v, uv_plane_size)
-  align_buffer_16(ro90_y, y_plane_size)
-  align_buffer_16(ro90_u, uv_plane_size)
-  align_buffer_16(ro90_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_uv, 0, uv_plane_size);
-  memset(ro0_y, 0, y_plane_size);
-  memset(ro0_u, 0, uv_plane_size);
-  memset(ro0_v, 0, uv_plane_size);
-  memset(ro90_y, 0, y_plane_size);
-  memset(ro90_u, 0, uv_plane_size);
-  memset(ro90_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw * 2 + b); j += 2) {
-      uint8 random_number = random() & 0x7f;
-      orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
-      orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
-    }
-  }
-
-  int y_off_0 = b * (yw + b * 2) + b;
-  int uv_off_0 = b * (uvw + b * 2) + b;
-  int y_off_90 = b * (yh + b * 2) + b;
-  int uv_off_90 = b * (uvh + b * 2) + b;
-
-  int y_st_0 = yw + b * 2;
-  int uv_st_0 = uvw + b * 2;
-  int y_st_90 = yh + b * 2;
-  int uv_st_90 = uvh + b * 2;
-
-  NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
-                   orig_uv+y_off_0, y_st_0,
-                   ro90_y+y_off_90, y_st_90,
-                   ro90_u+uv_off_90, uv_st_90,
-                   ro90_v+uv_off_90, uv_st_90,
-                   yw, yh,
-                   kRotateClockwise);
-
-  I420Rotate(ro90_y+y_off_90, y_st_90,
-             ro90_u+uv_off_90, uv_st_90,
-             ro90_v+uv_off_90, uv_st_90,
-             ro0_y+y_off_0, y_st_0,
-             ro0_u+uv_off_0, uv_st_0,
-             ro0_v+uv_off_0, uv_st_0,
-             yh, yw,
-             kRotateCounterClockwise);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != ro0_y[i])
-      ++err;
-  }
-
-  int zero_cnt = 0;
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
-      ++err;
-    }
-    if (ro0_u[i] != 0) {
-      ++zero_cnt;
-    }
-  }
-
-  if (!zero_cnt) {
-    ++err;
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_uv)
-  free_aligned_buffer_16(ro0_y)
-  free_aligned_buffer_16(ro0_u)
-  free_aligned_buffer_16(ro0_v)
-  free_aligned_buffer_16(ro90_y)
-  free_aligned_buffer_16(ro90_u)
-  free_aligned_buffer_16(ro90_v)
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_width_ - 3, benchmark_height_ - 1,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, NV12ToI420Rotate270) {
-  int err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-  int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_uv, o_uv_plane_size)
-  align_buffer_16(ro0_y, y_plane_size)
-  align_buffer_16(ro0_u, uv_plane_size)
-  align_buffer_16(ro0_v, uv_plane_size)
-  align_buffer_16(ro270_y, y_plane_size)
-  align_buffer_16(ro270_u, uv_plane_size)
-  align_buffer_16(ro270_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_uv, 0, o_uv_plane_size);
-  memset(ro0_y, 0, y_plane_size);
-  memset(ro0_u, 0, uv_plane_size);
-  memset(ro0_v, 0, uv_plane_size);
-  memset(ro270_y, 0, y_plane_size);
-  memset(ro270_u, 0, uv_plane_size);
-  memset(ro270_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw * 2 + b); j += 2) {
-      uint8 random_number = random() & 0x7f;
-      orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
-      orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
-    }
-  }
-
-  int y_off_0 = b * (yw + b * 2) + b;
-  int uv_off_0 = b * (uvw + b * 2) + b;
-  int y_off_270 = b * (yh + b * 2) + b;
-  int uv_off_270 = b * (uvh + b * 2) + b;
-
-  int y_st_0 = yw + b * 2;
-  int uv_st_0 = uvw + b * 2;
-  int y_st_270 = yh + b * 2;
-  int uv_st_270 = uvh + b * 2;
-
-  NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
-                   orig_uv+y_off_0, y_st_0,
-                   ro270_y+y_off_270, y_st_270,
-                   ro270_u+uv_off_270, uv_st_270,
-                   ro270_v+uv_off_270, uv_st_270,
-                   yw, yh,
-                   kRotateCounterClockwise);
-
-  I420Rotate(ro270_y+y_off_270, y_st_270,
-             ro270_u+uv_off_270, uv_st_270,
-             ro270_v+uv_off_270, uv_st_270,
-             ro0_y+y_off_0, y_st_0,
-             ro0_u+uv_off_0, uv_st_0,
-             ro0_v+uv_off_0, uv_st_0,
-             yh, yw,
-             kRotateClockwise);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != ro0_y[i])
-      ++err;
-  }
-
-  int zero_cnt = 0;
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
-      ++err;
-    }
-    if (ro0_u[i] != 0) {
-      ++zero_cnt;
-    }
-  }
-
-  if (!zero_cnt) {
-    ++err;
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_uv)
-  free_aligned_buffer_16(ro0_y)
-  free_aligned_buffer_16(ro0_u)
-  free_aligned_buffer_16(ro0_v)
-  free_aligned_buffer_16(ro270_y)
-  free_aligned_buffer_16(ro270_u)
-  free_aligned_buffer_16(ro270_v)
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_height_ - 1, benchmark_width_ - 3,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, NV12ToI420Rotate180) {
-  int err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-  int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_uv, o_uv_plane_size)
-  align_buffer_16(ro0_y, y_plane_size)
-  align_buffer_16(ro0_u, uv_plane_size)
-  align_buffer_16(ro0_v, uv_plane_size)
-  align_buffer_16(ro180_y, y_plane_size)
-  align_buffer_16(ro180_u, uv_plane_size)
-  align_buffer_16(ro180_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_uv, 0, o_uv_plane_size);
-  memset(ro0_y, 0, y_plane_size);
-  memset(ro0_u, 0, uv_plane_size);
-  memset(ro0_v, 0, uv_plane_size);
-  memset(ro180_y, 0, y_plane_size);
-  memset(ro180_u, 0, uv_plane_size);
-  memset(ro180_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw * 2 + b); j += 2) {
-      uint8 random_number = random() & 0x7f;
-      orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
-      orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
-    }
-  }
-
-  int y_off = b * (yw + b * 2) + b;
-  int uv_off = b * (uvw + b * 2) + b;
-
-  int y_st = yw + b * 2;
-  int uv_st = uvw + b * 2;
-
-  NV12ToI420Rotate(orig_y+y_off, y_st,
-                   orig_uv+y_off, y_st,
-                   ro180_y+y_off, y_st,
-                   ro180_u+uv_off, uv_st,
-                   ro180_v+uv_off, uv_st,
-                   yw, yh,
-                   kRotate180);
-
-  I420Rotate(ro180_y+y_off, y_st,
-             ro180_u+uv_off, uv_st,
-             ro180_v+uv_off, uv_st,
-             ro0_y+y_off, y_st,
-             ro0_u+uv_off, uv_st,
-             ro0_v+uv_off, uv_st,
-             yw, yh,
-             kRotate180);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != ro0_y[i]) {
-      ++err;
-    }
-  }
-
-  int zero_cnt = 0;
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
-      ++err;
-    }
-    if (ro0_u[i] != 0) {
-      ++zero_cnt;
-    }
-  }
-
-  if (!zero_cnt) {
-    ++err;
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_uv)
-  free_aligned_buffer_16(ro0_y)
-  free_aligned_buffer_16(ro0_u)
-  free_aligned_buffer_16(ro0_v)
-  free_aligned_buffer_16(ro180_y)
-  free_aligned_buffer_16(ro180_u)
-  free_aligned_buffer_16(ro180_v)
-
-  EXPECT_EQ(0, err);
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_width_ - 3, benchmark_height_ - 1,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
-  int y_err = 0, uv_err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-  int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_uv, o_uv_plane_size)
-  align_buffer_16(roa_y, y_plane_size)
-  align_buffer_16(roa_u, uv_plane_size)
-  align_buffer_16(roa_v, uv_plane_size)
-  align_buffer_16(rob_y, y_plane_size)
-  align_buffer_16(rob_u, uv_plane_size)
-  align_buffer_16(rob_v, uv_plane_size)
-  align_buffer_16(roc_y, y_plane_size)
-  align_buffer_16(roc_u, uv_plane_size)
-  align_buffer_16(roc_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_uv, 0, o_uv_plane_size);
-  memset(roa_y, 0, y_plane_size);
-  memset(roa_u, 0, uv_plane_size);
-  memset(roa_v, 0, uv_plane_size);
-  memset(rob_y, 0, y_plane_size);
-  memset(rob_u, 0, uv_plane_size);
-  memset(rob_v, 0, uv_plane_size);
-  memset(roc_y, 0, y_plane_size);
-  memset(roc_u, 0, uv_plane_size);
-  memset(roc_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw * 2 + b); j += 2) {
-      uint8 random_number = random() & 0x7f;
-      orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
-      orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
-    }
-  }
-
-  int y_off_0 = b * (yw + b * 2) + b;
-  int uv_off_0 = b * (uvw + b * 2) + b;
-  int y_off_90 = b * (yh + b * 2) + b;
-  int uv_off_90 = b * (uvh + b * 2) + b;
-
-  int y_st_0 = yw + b * 2;
-  int uv_st_0 = uvw + b * 2;
-  int y_st_90 = yh + b * 2;
-  int uv_st_90 = uvh + b * 2;
-
-  NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
-                   orig_uv+y_off_0, y_st_0,
-                   roa_y+y_off_90, y_st_90,
-                   roa_u+uv_off_90, uv_st_90,
-                   roa_v+uv_off_90, uv_st_90,
-                   yw, -yh,
-                   kRotateClockwise);
-
-  I420Rotate(roa_y+y_off_90, y_st_90,
-             roa_u+uv_off_90, uv_st_90,
-             roa_v+uv_off_90, uv_st_90,
-             rob_y+y_off_0, y_st_0,
-             rob_u+uv_off_0, uv_st_0,
-             rob_v+uv_off_0, uv_st_0,
-             yh, -yw,
-             kRotateCounterClockwise);
-
-  I420Rotate(rob_y+y_off_0, y_st_0,
-             rob_u+uv_off_0, uv_st_0,
-             rob_v+uv_off_0, uv_st_0,
-             roc_y+y_off_0, y_st_0,
-             roc_u+uv_off_0, uv_st_0,
-             roc_v+uv_off_0, uv_st_0,
-             yw, yh,
-             kRotate180);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != roc_y[i]) {
-      ++y_err;
-    }
-  }
-
-  if (y_err) {
-    printf("input %dx%d \n", yw, yh);
-    PrintArray(orig_y, y_st_0, yh + b * 2);
-
-    printf("rotate a\n");
-    PrintArray(roa_y, y_st_90, y_st_0);
-
-    printf("rotate b\n");
-    PrintArray(rob_y, y_st_90, y_st_0);
-
-    printf("rotate c\n");
-    PrintArray(roc_y, y_st_0, y_st_90);
-  }
-
-  int zero_cnt = 0;
-
-  for (i = 0; i < uv_plane_size; ++i) {
-    if ((signed char)roc_u[i] != -(signed char)roc_v[i]) {
-      ++uv_err;
-    }
-    if (rob_u[i] != 0) {
-      ++zero_cnt;
-    }
-  }
-
-  if (!zero_cnt) {
-    ++uv_err;
-  }
-
-  if (uv_err) {
-    printf("input %dx%d \n", uvw * 2, uvh);
-    PrintArray(orig_uv, y_st_0, uvh + b * 2);
-
-    printf("rotate a\n");
-    PrintArray(roa_u, uv_st_90, uv_st_0);
-    PrintArray(roa_v, uv_st_90, uv_st_0);
-
-    printf("rotate b\n");
-    PrintArray(rob_u, uv_st_90, uv_st_0);
-    PrintArray(rob_v, uv_st_90, uv_st_0);
-
-    printf("rotate c\n");
-    PrintArray(roc_u, uv_st_0, uv_st_90);
-    PrintArray(roc_v, uv_st_0, uv_st_90);
-  }
-
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_uv)
-  free_aligned_buffer_16(roa_y)
-  free_aligned_buffer_16(roa_u)
-  free_aligned_buffer_16(roa_v)
-  free_aligned_buffer_16(rob_y)
-  free_aligned_buffer_16(rob_u)
-  free_aligned_buffer_16(rob_v)
-  free_aligned_buffer_16(roc_y)
-  free_aligned_buffer_16(roc_u)
-  free_aligned_buffer_16(roc_v)
-
-  EXPECT_EQ(0, y_err + uv_err);
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+                 benchmark_height_ - 1, benchmark_width_ - 3,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
 }
 
-TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
-  int y_err = 0, uv_err = 0;
-
-  int yw = 1024;
-  int yh = 768;
-  int b = 128;
-  int uvw = (yw + 1) >> 1;
-  int uvh = (yh + 1) >> 1;
-  int i, j;
-
-  int y_plane_size = (yw + b * 2) * (yh + b * 2);
-  int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
-  int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
-
-  srandom(time(NULL));
-
-  align_buffer_16(orig_y, y_plane_size)
-  align_buffer_16(orig_uv, o_uv_plane_size)
-  align_buffer_16(roa_y, y_plane_size)
-  align_buffer_16(roa_u, uv_plane_size)
-  align_buffer_16(roa_v, uv_plane_size)
-  align_buffer_16(rob_y, y_plane_size)
-  align_buffer_16(rob_u, uv_plane_size)
-  align_buffer_16(rob_v, uv_plane_size)
-  memset(orig_y, 0, y_plane_size);
-  memset(orig_uv, 0, o_uv_plane_size);
-  memset(roa_y, 0, y_plane_size);
-  memset(roa_u, 0, uv_plane_size);
-  memset(roa_v, 0, uv_plane_size);
-  memset(rob_y, 0, y_plane_size);
-  memset(rob_u, 0, uv_plane_size);
-  memset(rob_v, 0, uv_plane_size);
-
-  // fill image buffers with random data
-  for (i = b; i < (yh + b); ++i) {
-    for (j = b; j < (yw + b); ++j) {
-      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
-    }
-  }
-
-  for (i = b; i < (uvh + b); ++i) {
-    for (j = b; j < (uvw * 2 + b); j += 2) {
-      uint8 random_number = random() & 0x7f;
-      orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
-      orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
-    }
-  }
-
-  int y_off = b * (yw + b * 2) + b;
-  int uv_off = b * (uvw + b * 2) + b;
-
-  int y_st = yw + b * 2;
-  int uv_st = uvw + b * 2;
-
-  NV12ToI420Rotate(orig_y+y_off, y_st,
-                   orig_uv+y_off, y_st,
-                   roa_y+y_off, y_st,
-                   roa_u+uv_off, uv_st,
-                   roa_v+uv_off, uv_st,
-                   yw, -yh,
-                   kRotate180);
-
-  I420Rotate(roa_y+y_off, y_st,
-             roa_u+uv_off, uv_st,
-             roa_v+uv_off, uv_st,
-             rob_y+y_off, y_st,
-             rob_u+uv_off, uv_st,
-             rob_v+uv_off, uv_st,
-             yw, -yh,
-             kRotate180);
-
-  for (i = 0; i < y_plane_size; ++i) {
-    if (orig_y[i] != rob_y[i])
-      ++y_err;
-  }
-
-  if (y_err) {
-    printf("input %dx%d \n", yw, yh);
-    PrintArray(orig_y, y_st, yh + b * 2);
-
-    printf("rotate a\n");
-    PrintArray(roa_y, y_st, yh + b * 2);
-
-    printf("rotate b\n");
-    PrintArray(rob_y, y_st, yh + b * 2);
-  }
-
-  int zero_cnt = 0;
+TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
+  NV12TestRotate(benchmark_width_, -benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  for (i = 0; i < uv_plane_size; ++i) {
-    if ((signed char)rob_u[i] != -(signed char)rob_v[i]) {
-      ++uv_err;
-    }
-    if (rob_u[i] != 0) {
-      ++zero_cnt;
-    }
-  }
+TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
+  NV12TestRotate(benchmark_width_, -benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  if (!zero_cnt) {
-    ++uv_err;
-  }
+TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
+  NV12TestRotate(benchmark_width_, -benchmark_height_,
+                 benchmark_width_, benchmark_height_,
+                 kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-  if (uv_err) {
-    printf("input %dx%d \n", uvw * 2, uvh);
-    PrintArray(orig_uv, y_st, uvh + b * 2);
+TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
+  NV12TestRotate(benchmark_width_, -benchmark_height_,
+                 benchmark_height_, benchmark_width_,
+                 kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
 
-    printf("rotate a\n");
-    PrintArray(roa_u, uv_st, uvh + b * 2);
-    PrintArray(roa_v, uv_st, uvh + b * 2);
 
-    printf("rotate b\n");
-    PrintArray(rob_u, uv_st, uvh + b * 2);
-    PrintArray(rob_v, uv_st, uvh + b * 2);
-  }
 
-  free_aligned_buffer_16(orig_y)
-  free_aligned_buffer_16(orig_uv)
-  free_aligned_buffer_16(roa_y)
-  free_aligned_buffer_16(roa_u)
-  free_aligned_buffer_16(roa_v)
-  free_aligned_buffer_16(rob_y)
-  free_aligned_buffer_16(rob_u)
-  free_aligned_buffer_16(rob_v)
 
-  EXPECT_EQ(0, y_err + uv_err);
-}
 
 }  // namespace libyuv
diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc
index fef96764..f99782f7 100644
--- a/files/unit_test/scale_argb_test.cc
+++ b/files/unit_test/scale_argb_test.cc
@@ -4,69 +4,80 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include <stdlib.h>
 #include <time.h>
 
+#include "libyuv/convert_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/scale_argb.h"
+#include "libyuv/video_common.h"
 #include "../unit_test/unit_test.h"
 
 namespace libyuv {
 
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
 static int ARGBTestFilter(int src_width, int src_height,
                           int dst_width, int dst_height,
-                          FilterMode f, int benchmark_iterations) {
-  const int b = 128;
-  int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
-  int src_stride_argb = (b * 2 + src_width) * 4;
+                          FilterMode f, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info) {
+  if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+    return 0;
+  }
 
-  align_buffer_16(src_argb, src_argb_plane_size)
-  memset(src_argb, 1, src_argb_plane_size);
+  int i, j;
+  const int b = 0;  // 128 to test for padding/stride.
+  int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2) * 4LL;
+  int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+  align_buffer_page_end(src_argb, src_argb_plane_size);
+  if (!src_argb) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  MemRandomize(src_argb, src_argb_plane_size);
 
-  int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+  int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
   int dst_stride_argb = (b * 2 + dst_width) * 4;
 
-  srandom(time(NULL));
-
-  int i, j;
-  for (i = b; i < (src_height + b); ++i) {
-    for (j = b; j < (src_width + b) * 4; ++j) {
-      src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
-    }
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
   }
-
-  align_buffer_16(dst_argb_c, dst_argb_plane_size)
-  align_buffer_16(dst_argb_opt, dst_argb_plane_size)
   memset(dst_argb_c, 2, dst_argb_plane_size);
   memset(dst_argb_opt, 3, dst_argb_plane_size);
 
   // Warm up both versions for consistent benchmarks.
-  MaskCpuFlags(0);  // Disable all CPU optimization.
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
   ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
             src_width, src_height,
             dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
             dst_width, dst_height, f);
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
   ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
             src_width, src_height,
             dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
             dst_width, dst_height, f);
 
-  MaskCpuFlags(0);  // Disable all CPU optimization.
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
   double c_time = get_time();
-  for (i = 0; i < benchmark_iterations; ++i) {
-    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
-              src_width, src_height,
-              dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
-              dst_width, dst_height, f);
-  }
-  c_time = (get_time() - c_time) / benchmark_iterations;
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+
+  c_time = (get_time() - c_time);
 
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
   double opt_time = get_time();
   for (i = 0; i < benchmark_iterations; ++i) {
     ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
@@ -78,7 +89,7 @@ static int ARGBTestFilter(int src_width, int src_height,
 
   // Report performance of C vs OPT
   printf("filter %d - %8d us C - %8d us OPT\n",
-         f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   //  operations may introduce rounding somewhere. So do a difference
@@ -87,7 +98,7 @@ static int ARGBTestFilter(int src_width, int src_height,
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b * 4; j < (dst_width + b) * 4; ++j) {
-      int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
                          dst_argb_opt[(i * dst_stride_argb) + j]);
       if (abs_diff > max_diff) {
         max_diff = abs_diff;
@@ -95,161 +106,357 @@ static int ARGBTestFilter(int src_width, int src_height,
     }
   }
 
-  free_aligned_buffer_16(dst_argb_c)
-  free_aligned_buffer_16(dst_argb_opt)
-  free_aligned_buffer_16(src_argb)
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_argb);
   return max_diff;
 }
 
-TEST_F(libyuvTest, ARGBScaleDownBy2) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 2;
-  const int dst_height = src_height / 2;
-
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+static const int kTileX = 8;
+static const int kTileY = 8;
+
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+                         int src_width, int src_height,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int dst_width, int dst_height,
+                         FilterMode filtering) {
+  for (int y = 0; y < dst_height; y += kTileY) {
+    for (int x = 0; x < dst_width; x += kTileX) {
+      int clip_width = kTileX;
+      if (x + clip_width > dst_width) {
+        clip_width = dst_width - x;
+      }
+      int clip_height = kTileY;
+      if (y + clip_height > dst_height) {
+        clip_height = dst_height - y;
+      }
+      int r = ARGBScaleClip(src_argb, src_stride_argb,
+                            src_width, src_height,
+                            dst_argb, dst_stride_argb,
+                            dst_width, dst_height,
+                            x, y, clip_width, clip_height, filtering);
+      if (r) {
+        return r;
+      }
+    }
   }
+  return 0;
 }
 
-TEST_F(libyuvTest, ARGBScaleDownBy4) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 4;
-  const int dst_height = src_height / 4;
+static int ARGBClipTestFilter(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              FilterMode f, int benchmark_iterations) {
+  if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+    return 0;
+  }
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  const int b = 128;
+  int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2) * 4;
+  int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+  align_buffer_page_end(src_argb, src_argb_plane_size);
+  if (!src_argb) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
   }
-}
+  memset(src_argb, 1, src_argb_plane_size);
 
-TEST_F(libyuvTest, ARGBScaleDownBy5) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 5;
-  const int dst_height = src_height / 5;
+  int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+  int dst_stride_argb = (b * 2 + dst_width) * 4;
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  int i, j;
+  for (i = b; i < (Abs(src_height) + b); ++i) {
+    for (j = b; j < (Abs(src_width) + b) * 4; ++j) {
+      src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff);
+    }
   }
-}
 
-TEST_F(libyuvTest, ARGBScaleDownBy8) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 8;
-  const int dst_height = src_height / 8;
-
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
   }
-}
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
 
-TEST_F(libyuvTest, ARGBScaleDownBy16) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 16;
-  const int dst_height = src_height / 16;
+  // Do full image, no clipping.
+  double c_time = get_time();
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  c_time = (get_time() - c_time);
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  // Do tiled image, clipping scale to a tile at a time.
+  double opt_time = get_time();
+  for (i = 0; i < benchmark_iterations; ++i) {
+    TileARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+                  src_width, src_height,
+                  dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+                  dst_width, dst_height, f);
   }
-}
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
 
-TEST_F(libyuvTest, ARGBScaleDownBy34) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width * 3 / 4;
-  const int dst_height = src_height * 3 / 4;
+  // Report performance of Full vs Tiled.
+  printf("filter %d - %8d us Full - %8d us Tiled\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  // Compare full scaled image vs tiled image.
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
   }
+
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_argb);
+  return max_diff;
 }
 
-TEST_F(libyuvTest, ARGBScaleDownBy38) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = src_width * 3 / 8;
-  int dst_height = src_height * 3 / 8;
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                       \
+    TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) {                \
+      int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom),              \
+                                SX(benchmark_height_, nom, denom),             \
+                                DX(benchmark_width_, nom, denom),              \
+                                DX(benchmark_height_, nom, denom),             \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) {            \
+      int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom),          \
+                                    SX(benchmark_height_, nom, denom),         \
+                                    DX(benchmark_width_, nom, denom),          \
+                                    DX(benchmark_height_, nom, denom),         \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
+// filtering is different fixed point implementations for SSSE3, Neon and C.
+#define TEST_FACTOR(name, nom, denom)                                          \
+    TEST_FACTOR1(name, None, nom, denom, 0)                                    \
+    TEST_FACTOR1(name, Linear, nom, denom, 3)                                  \
+    TEST_FACTOR1(name, Bilinear, nom, denom, 3)                                \
+    TEST_FACTOR1(name, Box, nom, denom, 3)
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+TEST_FACTOR(8, 1, 8)
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff)                   \
+    TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) {           \
+      int diff = ARGBTestFilter(benchmark_width_, benchmark_height_,           \
+                                width, height,                                 \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
+      int diff = ARGBTestFilter(width, height,                                 \
+                                Abs(benchmark_width_), Abs(benchmark_height_), \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) {       \
+      int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_,       \
+                                    width, height,                             \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) {     \
+      int diff = ARGBClipTestFilter(width, height,                             \
+                                    Abs(benchmark_width_),                     \
+                                    Abs(benchmark_height_),                    \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
 
-TEST_F(libyuvTest, ARGBScaleTo1366) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = 1366;
-  int dst_height = 768;
+/// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height)                                      \
+    TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 3)                              \
+    TEST_SCALETO1(name, width, height, Bilinear, 3)
+
+TEST_SCALETO(ARGBScale, 1, 1)
+TEST_SCALETO(ARGBScale, 320, 240)
+TEST_SCALETO(ARGBScale, 352, 288)
+TEST_SCALETO(ARGBScale, 569, 480)
+TEST_SCALETO(ARGBScale, 640, 360)
+TEST_SCALETO(ARGBScale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
+                             const uint8* src_u, int src_stride_u,
+                             const uint8* src_v, int src_stride_v,
+                             uint32 src_fourcc,
+                             int src_width, int src_height,
+                             uint8* dst_argb, int dst_stride_argb,
+                             uint32 dst_fourcc,
+                             int dst_width, int dst_height,
+                             int clip_x, int clip_y,
+                             int clip_width, int clip_height,
+                             enum FilterMode filtering) {
+  uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
+  int r;
+  I420ToARGB(src_y, src_stride_y,
+             src_u, src_stride_u,
+             src_v, src_stride_v,
+             argb_buffer, src_width * 4,
+             src_width, src_height);
+
+  r = ARGBScaleClip(argb_buffer, src_width * 4,
+                    src_width, src_height,
+                    dst_argb, dst_stride_argb,
+                    dst_width, dst_height,
+                    clip_x, clip_y, clip_width, clip_height,
+                    filtering);
+  free(argb_buffer);
+  return r;
+}
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
+  int rv = v;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      *buf++ = v;
+      v += dx;
+      if (v < 0 || v > 255) {
+        dx = -dx;
+        v += dx;
+      }
+    }
+    v = rv + dy;
+    if (v < 0 || v > 255) {
+      dy = -dy;
+      v += dy;
+    }
+    rv = v;
   }
 }
 
-TEST_F(libyuvTest, ARGBScaleTo4074) {
-  int src_width = 2880 * 2;
-  int src_height = 1800;
-  int dst_width = 4074;
-  int dst_height = 1272;
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int YUVToARGBTestFilter(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               FilterMode f, int benchmark_iterations,
+                               int disable_cpu_flags, int benchmark_cpu_info) {
+  int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
+  int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
+      ((Abs(src_height) + 1) / 2);
+  int src_stride_y = Abs(src_width);
+  int src_stride_uv = (Abs(src_width) + 1) / 2;
+
+  align_buffer_page_end(src_y, src_y_plane_size);
+  align_buffer_page_end(src_u, src_uv_plane_size);
+  align_buffer_page_end(src_v, src_uv_plane_size);
+
+  int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
+  int dst_stride_argb = (dst_width) * 4;
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  // Fill YUV image with continuous ramp, which is less sensitive to
+  // subsampling and filtering differences for test purposes.
+  FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1);
+  FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1);
+  FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1);
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  YUVToARGBScaleReference2(src_y, src_stride_y,
+                           src_u, src_stride_uv,
+                           src_v, src_stride_uv,
+                           libyuv::FOURCC_I420,
+                           src_width, src_height,
+                           dst_argb_c, dst_stride_argb,
+                           libyuv::FOURCC_I420,
+                           dst_width, dst_height,
+                           0, 0, dst_width, dst_height,
+                           f);
+
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    YUVToARGBScaleClip(src_y, src_stride_y,
+                       src_u, src_stride_uv,
+                       src_v, src_stride_uv,
+                       libyuv::FOURCC_I420,
+                       src_width, src_height,
+                       dst_argb_opt, dst_stride_argb,
+                       libyuv::FOURCC_I420,
+                       dst_width, dst_height,
+                       0, 0, dst_width, dst_height,
+                       f);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < dst_height; ++i) {
+    for (int j = 0; j < dst_width * 4; ++j) {
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
+      if (abs_diff > max_diff) {
+        printf("error %d at %d,%d c %d opt %d",
+               abs_diff,
+               j, i,
+               dst_argb_c[(i * dst_stride_argb) + j],
+               dst_argb_opt[(i * dst_stride_argb) + j]);
+        EXPECT_LE(abs_diff, 40);
+        max_diff = abs_diff;
+      }
+    }
   }
-}
 
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_y);
+  free_aligned_buffer_page_end(src_u);
+  free_aligned_buffer_page_end(src_v);
+  return max_diff;
+}
 
-TEST_F(libyuvTest, ARGBScaleTo853) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = 853;
-  int dst_height = 480;
+TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
+  int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+                                 benchmark_width_ * 3 / 2,
+                                 benchmark_height_ * 3 / 2,
+                                 libyuv::kFilterBilinear,
+                                 benchmark_iterations_,
+                                 disable_cpu_flags_, benchmark_cpu_info_);
+  EXPECT_LE(diff, 10);
+}
 
-  for (int f = 0; f < 2; ++f) {
-    int max_diff = ARGBTestFilter(src_width, src_height,
-                                  dst_width, dst_height,
-                                  static_cast<FilterMode>(f),
-                                  benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
+TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
+  int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
+                                 benchmark_height_ * 3 / 2,
+                                 benchmark_width_, benchmark_height_,
+                                 libyuv::kFilterBilinear,
+                                 benchmark_iterations_,
+                                 disable_cpu_flags_, benchmark_cpu_info_);
+  EXPECT_LE(diff, 10);
 }
 
+
 }  // namespace libyuv
diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc
index 55b4148d..f40443e2 100644
--- a/files/unit_test/scale_test.cc
+++ b/files/unit_test/scale_test.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -15,59 +15,65 @@
 #include "libyuv/scale.h"
 #include "../unit_test/unit_test.h"
 
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
 namespace libyuv {
 
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
 static int TestFilter(int src_width, int src_height,
                       int dst_width, int dst_height,
-                      FilterMode f, int rounding, int benchmark_iterations) {
-  const int b = 128 * rounding;
-  int src_width_uv = (src_width + rounding) >> 1;
-  int src_height_uv = (src_height + rounding) >> 1;
+                      FilterMode f, int benchmark_iterations,
+                      int disable_cpu_flags, int benchmark_cpu_info) {
+  if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+    return 0;
+  }
+
+  int i, j;
+  const int b = 0;  // 128 to test for padding/stride.
+  int src_width_uv = (Abs(src_width) + 1) >> 1;
+  int src_height_uv = (Abs(src_height) + 1) >> 1;
 
-  int src_y_plane_size = (src_width + b * 2) * (src_height + b * 2);
-  int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+  int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
+  int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
 
-  int src_stride_y = b * 2 + src_width;
+  int src_stride_y = b * 2 + Abs(src_width);
   int src_stride_uv = b * 2 + src_width_uv;
 
   align_buffer_page_end(src_y, src_y_plane_size)
   align_buffer_page_end(src_u, src_uv_plane_size)
   align_buffer_page_end(src_v, src_uv_plane_size)
+  if (!src_y || !src_u || !src_v) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  MemRandomize(src_y, src_y_plane_size);
+  MemRandomize(src_u, src_uv_plane_size);
+  MemRandomize(src_v, src_uv_plane_size);
 
-  int dst_width_uv = (dst_width + rounding) >> 1;
-  int dst_height_uv = (dst_height + rounding) >> 1;
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
 
-  int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
-  int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+  int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+  int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
 
   int dst_stride_y = b * 2 + dst_width;
   int dst_stride_uv = b * 2 + dst_width_uv;
 
-  srandom(time(NULL));
-
-  int i, j;
-  for (i = b; i < (src_height + b); ++i) {
-    for (j = b; j < (src_width + b); ++j) {
-      src_y[(i * src_stride_y) + j] = (random() & 0xff);
-    }
-  }
-
-  for (i = b; i < (src_height_uv + b); ++i) {
-    for (j = b; j < (src_width_uv + b); ++j) {
-      src_u[(i * src_stride_uv) + j] = (random() & 0xff);
-      src_v[(i * src_stride_uv) + j] = (random() & 0xff);
-    }
-  }
-
   align_buffer_page_end(dst_y_c, dst_y_plane_size)
   align_buffer_page_end(dst_u_c, dst_uv_plane_size)
   align_buffer_page_end(dst_v_c, dst_uv_plane_size)
   align_buffer_page_end(dst_y_opt, dst_y_plane_size)
   align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
   align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
+  if (!dst_y_c || !dst_u_c || !dst_v_c ||
+      !dst_y_opt|| !dst_u_opt|| !dst_v_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
 
-  // Warm up both versions for consistent benchmarks.
-  MaskCpuFlags(0);  // Disable all CPU optimization.
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  double c_time = get_time();
   I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
             src_u + (src_stride_uv * b) + b, src_stride_uv,
             src_v + (src_stride_uv * b) + b, src_stride_uv,
@@ -76,31 +82,9 @@ static int TestFilter(int src_width, int src_height,
             dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
             dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
             dst_width, dst_height, f);
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
-  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
-            src_u + (src_stride_uv * b) + b, src_stride_uv,
-            src_v + (src_stride_uv * b) + b, src_stride_uv,
-            src_width, src_height,
-            dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
-            dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
-            dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
-            dst_width, dst_height, f);
-
-  MaskCpuFlags(0);  // Disable all CPU optimization.
-  double c_time = get_time();
-  for (i = 0; i < benchmark_iterations; ++i) {
-    I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
-              src_u + (src_stride_uv * b) + b, src_stride_uv,
-              src_v + (src_stride_uv * b) + b, src_stride_uv,
-              src_width, src_height,
-              dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
-              dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_width, dst_height, f);
-  }
-  c_time = (get_time() - c_time) / benchmark_iterations;
+  c_time = (get_time() - c_time);
 
-  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
   double opt_time = get_time();
   for (i = 0; i < benchmark_iterations; ++i) {
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
@@ -113,10 +97,11 @@ static int TestFilter(int src_width, int src_height,
               dst_width, dst_height, f);
   }
   opt_time = (get_time() - opt_time) / benchmark_iterations;
-
   // Report performance of C vs OPT
   printf("filter %d - %8d us C - %8d us OPT\n",
-         f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+         f,
+         static_cast<int>(c_time * 1e6),
+         static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   //  operations may introduce rounding somewhere. So do a difference
@@ -125,7 +110,7 @@ static int TestFilter(int src_width, int src_height,
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b; j < (dst_width + b); ++j) {
-      int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+      int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
                          dst_y_opt[(i * dst_stride_y) + j]);
       if (abs_diff > max_diff) {
         max_diff = abs_diff;
@@ -135,12 +120,12 @@ static int TestFilter(int src_width, int src_height,
 
   for (i = b; i < (dst_height_uv + b); ++i) {
     for (j = b; j < (dst_width_uv + b); ++j) {
-      int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+      int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
                          dst_u_opt[(i * dst_stride_uv) + j]);
       if (abs_diff > max_diff) {
         max_diff = abs_diff;
       }
-      abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+      abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
                      dst_v_opt[(i * dst_stride_uv) + j]);
       if (abs_diff > max_diff) {
         max_diff = abs_diff;
@@ -162,215 +147,226 @@ static int TestFilter(int src_width, int src_height,
   return max_diff;
 }
 
-TEST_F(libyuvTest, ScaleDownBy2) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 2;
-  const int dst_height = src_height / 2;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int TestFilter_16(int src_width, int src_height,
+                         int dst_width, int dst_height,
+                         FilterMode f, int benchmark_iterations) {
+  if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+    return 0;
   }
-}
 
-TEST_F(libyuvTest, ScaleDownBy4) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 4;
-  const int dst_height = src_height / 4;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 2);  // This is the only scale factor with error of 2.
-  }
-}
+  int i, j;
+  const int b = 0;  // 128 to test for padding/stride.
+  int src_width_uv = (Abs(src_width) + 1) >> 1;
+  int src_height_uv = (Abs(src_height) + 1) >> 1;
 
-TEST_F(libyuvTest, ScaleDownBy5) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 5;
-  const int dst_height = src_height / 5;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  int64 src_y_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2);
+  int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
 
-TEST_F(libyuvTest, ScaleDownBy8) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 8;
-  const int dst_height = src_height / 8;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  int src_stride_y = b * 2 + Abs(src_width);
+  int src_stride_uv = b * 2 + src_width_uv;
 
-TEST_F(libyuvTest, ScaleDownBy16) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width / 16;
-  const int dst_height = src_height / 16;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  align_buffer_page_end(src_y, src_y_plane_size)
+  align_buffer_page_end(src_u, src_uv_plane_size)
+  align_buffer_page_end(src_v, src_uv_plane_size)
+  align_buffer_page_end(src_y_16, src_y_plane_size * 2)
+  align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
+  align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
+  uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
+  uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
+  uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
+
+  MemRandomize(src_y, src_y_plane_size);
+  MemRandomize(src_u, src_uv_plane_size);
+  MemRandomize(src_v, src_uv_plane_size);
+
+  for (i = b; i < src_height + b; ++i) {
+    for (j = b; j < src_width + b; ++j) {
+      p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
+    }
   }
-}
 
-TEST_F(libyuvTest, ScaleDownBy34) {
-  const int src_width = 1280;
-  const int src_height = 720;
-  const int dst_width = src_width * 3 / 4;
-  const int dst_height = src_height * 3 / 4;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  for (i = b; i < (src_height_uv + b); ++i) {
+    for (j = b; j < (src_width_uv + b); ++j) {
+      p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
+      p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
+    }
   }
-}
 
-TEST_F(libyuvTest, ScaleDownBy38) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = src_width * 3 / 8;
-  int dst_height = src_height * 3 / 8;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
 
-TEST_F(libyuvTest, ScaleTo1366) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = 1366;
-  int dst_height = 768;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+  int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
 
-TEST_F(libyuvTest, ScaleTo4074) {
-  int src_width = 2880 * 2;
-  int src_height = 1800;
-  int dst_width = 4074;
-  int dst_height = 1272;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  int dst_stride_y = b * 2 + dst_width;
+  int dst_stride_uv = b * 2 + dst_width_uv;
 
-TEST_F(libyuvTest, ScaleTo853) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = 853;
-  int dst_height = 480;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  align_buffer_page_end(dst_y_8, dst_y_plane_size)
+  align_buffer_page_end(dst_u_8, dst_uv_plane_size)
+  align_buffer_page_end(dst_v_8, dst_uv_plane_size)
+  align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
+  align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
+  align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
 
-TEST_F(libyuvTest, ScaleTo853Wrong) {
-  int src_width = 1280;
-  int src_height = 720;
-  int dst_width = 853;
-  int dst_height = 480;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 0,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
-  }
-}
+  uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
+  uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
+  uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
+
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
 
-// A one off test for a screen cast resolution scale.
-TEST_F(libyuvTest, ScaleTo684) {
-  int src_width = 686;
-  int src_height = 557;
-  int dst_width = 684;
-  int dst_height = 552;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  for (i = 0; i < benchmark_iterations; ++i) {
+    I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
+                 p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
+                 p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
+                 src_width, src_height,
+                 p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
+                 p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
+                 p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
+                 dst_width, dst_height, f);
   }
-}
 
-TEST_F(libyuvTest, ScaleTo342) {
-  int src_width = 686;
-  int src_height = 557;
-  int dst_width = 342;
-  int dst_height = 276;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  // Expect an exact match
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b; j < (dst_width + b); ++j) {
+      int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
+                         p_dst_y_16[(i * dst_stride_y) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
   }
-}
 
-TEST_F(libyuvTest, ScaleToHalf342) {
-  int src_width = 684;
-  int src_height = 552;
-  int dst_width = 342;
-  int dst_height = 276;
-
-  for (int f = 0; f < 3; ++f) {
-    int max_diff = TestFilter(src_width, src_height,
-                              dst_width, dst_height,
-                              static_cast<FilterMode>(f), 1,
-                              benchmark_iterations_);
-    EXPECT_LE(max_diff, 1);
+  for (i = b; i < (dst_height_uv + b); ++i) {
+    for (j = b; j < (dst_width_uv + b); ++j) {
+      int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
+                         p_dst_u_16[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+      abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
+                     p_dst_v_16[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
   }
+
+  free_aligned_buffer_page_end(dst_y_8)
+  free_aligned_buffer_page_end(dst_u_8)
+  free_aligned_buffer_page_end(dst_v_8)
+  free_aligned_buffer_page_end(dst_y_16)
+  free_aligned_buffer_page_end(dst_u_16)
+  free_aligned_buffer_page_end(dst_v_16)
+
+  free_aligned_buffer_page_end(src_y)
+  free_aligned_buffer_page_end(src_u)
+  free_aligned_buffer_page_end(src_v)
+  free_aligned_buffer_page_end(src_y_16)
+  free_aligned_buffer_page_end(src_u_16)
+  free_aligned_buffer_page_end(src_v_16)
+
+  return max_diff;
 }
 
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// 2 is chroma subsample
+#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                       \
+    TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) {                    \
+      int diff = TestFilter(SX(benchmark_width_, nom, denom),                  \
+                            SX(benchmark_height_, nom, denom),                 \
+                            DX(benchmark_width_, nom, denom),                  \
+                            DX(benchmark_height_, nom, denom),                 \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) {      \
+      int diff = TestFilter_16(SX(benchmark_width_, nom, denom),               \
+                               SX(benchmark_height_, nom, denom),              \
+                               DX(benchmark_width_, nom, denom),               \
+                               DX(benchmark_height_, nom, denom),              \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
+// filtering is different fixed point implementations for SSSE3, Neon and C.
+#define TEST_FACTOR(name, nom, denom, boxdiff)                                 \
+    TEST_FACTOR1(name, None, nom, denom, 0)                                    \
+    TEST_FACTOR1(name, Linear, nom, denom, 3)                                  \
+    TEST_FACTOR1(name, Bilinear, nom, denom, 3)                                \
+    TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+TEST_FACTOR(8, 1, 8, 0)
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 0)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff)                   \
+    TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) {           \
+      int diff = TestFilter(benchmark_width_, benchmark_height_,               \
+                            width, height,                                     \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
+      int diff = TestFilter(width, height,                                     \
+                            Abs(benchmark_width_), Abs(benchmark_height_),     \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest,                                                    \
+        DISABLED_##name##To##width##x##height##_##filter##_16) {               \
+      int diff = TestFilter_16(benchmark_width_, benchmark_height_,            \
+                               width, height,                                  \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest,                                                    \
+        DISABLED_##name##From##width##x##height##_##filter##_16) {             \
+      int diff = TestFilter_16(width, height,                                  \
+                               Abs(benchmark_width_), Abs(benchmark_height_),  \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height)                                      \
+    TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 0)                              \
+    TEST_SCALETO1(name, width, height, Bilinear, 0)                            \
+    TEST_SCALETO1(name, width, height, Box, 0)
+
+TEST_SCALETO(Scale, 1, 1)
+TEST_SCALETO(Scale, 320, 240)
+TEST_SCALETO(Scale, 352, 288)
+TEST_SCALETO(Scale, 569, 480)
+TEST_SCALETO(Scale, 640, 360)
+TEST_SCALETO(Scale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
 }  // namespace libyuv
diff --git a/files/unit_test/testdata/juno.txt b/files/unit_test/testdata/juno.txt
new file mode 100644
index 00000000..c275be74
--- /dev/null
+++ b/files/unit_test/testdata/juno.txt
@@ -0,0 +1,15 @@
+Processor       : AArch64 Processor rev 0 (aarch64)
+processor       : 0
+processor       : 1
+processor       : 2
+processor       : 3
+processor       : 4
+processor       : 5
+Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: AArch64
+CPU variant     : 0x0
+CPU part        : 0xd07
+CPU revision    : 0
+
+Hardware        : Juno
diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc
index 007c81f0..e75510fd 100644
--- a/files/unit_test/unit_test.cc
+++ b/files/unit_test/unit_test.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -14,20 +14,343 @@
 
 #include <cstring>
 
+#include "gflags/gflags.h"
+
 // Change this to 1000 for benchmarking.
 // TODO(fbarchard): Add command line parsing to pass this as option.
 #define BENCHMARK_ITERATIONS 1
 
-libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
-    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(1280),
-    benchmark_height_(720) {
-    const char* repeat = getenv("LIBYUV_REPEAT");
-    if (repeat) {
-      benchmark_iterations_ = atoi(repeat);  // NOLINT
-    }
+unsigned int fastrand_seed = 0xfb;
+
+DEFINE_int32(libyuv_width, 0, "width of test image.");
+DEFINE_int32(libyuv_height, 0, "height of test image.");
+DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
+DEFINE_int32(libyuv_flags, 0,
+             "cpu flags for reference code. 1 = C, -1 = SIMD");
+DEFINE_int32(libyuv_cpu_info, 0,
+             "cpu flags for benchmark code. 1 = C, -1 = SIMD");
+
+// For quicker unittests, default is 128 x 72.  But when benchmarking,
+// default to 720p.  Allow size to specify.
+// Set flags to -1 for benchmarking to avoid slower C code.
+
+LibYUVConvertTest::LibYUVConvertTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
+}
+
+LibYUVColorTest::LibYUVColorTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
+}
+
+LibYUVScaleTest::LibYUVScaleTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
+}
+
+LibYUVRotateTest::LibYUVRotateTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
+}
+
+LibYUVPlanarTest::LibYUVPlanarTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
+}
+
+LibYUVBaseTest::LibYUVBaseTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  benchmark_pixels_div256_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((
+      static_cast<double>(Abs(benchmark_width_)) *
+      static_cast<double>(Abs(benchmark_height_)) *
+      static_cast<double>(benchmark_iterations_)  + 1279.0) / 1280.0);
 }
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
+  // AllowCommandLineParsing allows us to ignore flags passed on to us by
+  // Chromium build bots without having to explicitly disable them.
+  google::AllowCommandLineReparsing();
+  google::ParseCommandLineFlags(&argc, &argv, true);
   return RUN_ALL_TESTS();
 }
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
index 62521e88..f2c4bef0 100644
--- a/files/unit_test/unit_test.h
+++ b/files/unit_test/unit_test.h
@@ -4,53 +4,85 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef UNIT_TEST_UNIT_TEST_H_
+#ifndef UNIT_TEST_UNIT_TEST_H_  // NOLINT
 #define UNIT_TEST_UNIT_TEST_H_
 
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
 #include <gtest/gtest.h>
 
-#define align_buffer_16(var, size)                                             \
-  uint8* var;                                                                  \
-  uint8* var##_mem;                                                            \
-  var##_mem = reinterpret_cast<uint8*>(malloc((size) + 15));                   \
-  var = reinterpret_cast<uint8*>                                               \
-        ((reinterpret_cast<intptr_t>(var##_mem) + 15) & ~15);
+#include "libyuv/basic_types.h"
 
-#define free_aligned_buffer_16(var) \
-  free(var##_mem);  \
-  var = 0;
+#ifndef SIMD_ALIGNED
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#elif defined(__GNUC__) && !defined(__pnacl__)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#else
+#define SIMD_ALIGNED(var) var
+#endif
+#endif
 
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+#define OFFBY 0
+
+// Scaling uses 16.16 fixed point to step thru the source image, so a
+// maximum size of 32767.999 can be expressed.  32768 is valid because
+// the step is 1 beyond the image but not used.
+// Destination size is mainly constrained by valid scale step not the
+// absolute size, so it may be possible to relax the destination size
+// constraint.
+// Source size is unconstrained for most specialized scalers.  e.g.
+// An image of 65536 scaled to half size would be valid.  The test
+// could be relaxed for special scale factors.
+// If this test is removed, the scaling function should gracefully
+// fail with a return code.  The test could be changed to know that
+// libyuv failed in a controlled way.
+
+static const int kMaxWidth = 32768;
+static const int kMaxHeight = 32768;
+
+static inline bool SizeValid(int src_width, int src_height,
+                             int dst_width, int dst_height) {
+  if (src_width > kMaxWidth || src_height > kMaxHeight ||
+      dst_width > kMaxWidth || dst_height > kMaxHeight) {
+    printf("Warning - size too large to test.  Skipping\n");
+    return false;
+  }
+  return true;
+}
 
 #define align_buffer_page_end(var, size)                                       \
   uint8* var;                                                                  \
   uint8* var##_mem;                                                            \
-  var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095));       \
-  var = var##_mem + (-(size) & 4095);
+  var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095));  \
+  var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) -       \
+      (size)) & ~63);
 
 #define free_aligned_buffer_page_end(var) \
   free(var##_mem);  \
   var = 0;
 
 #ifdef WIN32
-#include <windows.h>
 static inline double get_time() {
   LARGE_INTEGER t, f;
   QueryPerformanceCounter(&t);
   QueryPerformanceFrequency(&f);
   return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
 }
-
-#define random rand
-#define srandom srand
 #else
-
-#include <sys/time.h>
-#include <sys/resource.h>
-
 static inline double get_time() {
   struct timeval t;
   struct timezone tzp;
@@ -59,16 +91,109 @@ static inline double get_time() {
 }
 #endif
 
-class libyuvTest : public ::testing::Test {
+#ifndef SIMD_ALIGNED
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#elif defined(__GNUC__) && !defined(__pnacl__)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#else
+#define SIMD_ALIGNED(var) var
+#endif
+#endif
+
+extern unsigned int fastrand_seed;
+inline int fastrand() {
+  fastrand_seed = fastrand_seed * 214013u + 2531011u;
+  return static_cast<int>((fastrand_seed >> 16) & 0xffff);
+}
+
+static inline void MemRandomize(uint8* dst, int64 len) {
+  int64 i;
+  for (i = 0; i < len - 1; i += 2) {
+    *reinterpret_cast<uint16*>(dst) = fastrand();
+    dst += 2;
+  }
+  for (; i < len; ++i) {
+    *dst++ = fastrand();
+  }
+}
+
+class LibYUVColorTest : public ::testing::Test {
+ protected:
+  LibYUVColorTest();
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVConvertTest : public ::testing::Test {
+ protected:
+  LibYUVConvertTest();
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVScaleTest : public ::testing::Test {
  protected:
-  libyuvTest();
+  LibYUVScaleTest();
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
 
-  const int rotate_max_w_;
-  const int rotate_max_h_;
+class LibYUVRotateTest : public ::testing::Test {
+ protected:
+  LibYUVRotateTest();
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVPlanarTest : public ::testing::Test {
+ protected:
+  LibYUVPlanarTest();
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVBaseTest : public ::testing::Test {
+ protected:
+  LibYUVBaseTest();
 
-  int benchmark_iterations_;
-  const int benchmark_width_;
-  const int benchmark_height_;
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 1280.  Use 640 for benchmarking VGA.
+  int benchmark_height_;  // Default 720.  Use 360 for benchmarking VGA.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
 };
 
-#endif  // UNIT_TEST_UNIT_TEST_H_
+#endif  // UNIT_TEST_UNIT_TEST_H_  NOLINT
diff --git a/files/unit_test/version_test.cc b/files/unit_test/version_test.cc
deleted file mode 100644
index c53d754c..00000000
--- a/files/unit_test/version_test.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "libyuv/basic_types.h"
-#include "libyuv/version.h"
-#include "../unit_test/unit_test.h"
-
-namespace libyuv {
-
-// Tests SVN version against include/libyuv/version.h
-// SVN version is bumped by documentation changes as well as code.
-// Although the versions should match, once checked in, a tolerance is allowed.
-TEST_F(libyuvTest, TestVersion) {
-  EXPECT_GE(LIBYUV_VERSION, 169);  // 169 is first version to support version.
-  printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
-#ifdef LIBYUV_SVNREVISION
-  const char *ver = strchr(LIBYUV_SVNREVISION, ':');
-  if (ver) {
-    ++ver;
-  } else {
-    ver = LIBYUV_SVNREVISION;
-  }
-  int svn_revision = atoi(ver);  // NOLINT
-  printf("LIBYUV_SVNREVISION %d\n", svn_revision);
-  EXPECT_NEAR(LIBYUV_VERSION, svn_revision, 3);  // Allow version to be close.
-  if (LIBYUV_VERSION != svn_revision) {
-    printf("WARNING - Versions do not match.\n");
-  }
-#endif
-}
-
-}  // namespace libyuv
diff --git a/files/unit_test/video_common_test.cc b/files/unit_test/video_common_test.cc
new file mode 100644
index 00000000..ac97d0f3
--- /dev/null
+++ b/files/unit_test/video_common_test.cc
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests FourCC codes in video common, which are used for ConvertToI420().
+
+static bool TestValidChar(uint32 onecc) {
+  if ((onecc >= '0' && onecc <= '9') ||
+      (onecc >= 'A' && onecc <= 'Z') ||
+      (onecc >= 'a' && onecc <= 'z') ||
+      (onecc == ' ') || (onecc == 0xff)) {
+    return true;
+  }
+  return false;
+}
+
+static bool TestValidFourCC(uint32 fourcc, int bpp) {
+  if (!TestValidChar(fourcc & 0xff) ||
+      !TestValidChar((fourcc >> 8) & 0xff) ||
+      !TestValidChar((fourcc >> 16) & 0xff) ||
+      !TestValidChar((fourcc >> 24) & 0xff)) {
+    return false;
+  }
+  if (bpp < 0 || bpp > 32) {
+    return false;
+  }
+  return true;
+}
+
+TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
+  EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
+  EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_YU12));
+  EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
+  EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
+  EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
+  EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUVS));
+  EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_HDYC));
+  EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
+  EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
+  EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
+  EXPECT_EQ(FOURCC_RAW,  CanonicalFourCC(FOURCC_RGB3));
+  EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
+  EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
+  EXPECT_EQ(FOURCC_RAW,  CanonicalFourCC(FOURCC_CM24));
+  EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_L555));
+  EXPECT_EQ(FOURCC_RGBP, CanonicalFourCC(FOURCC_L565));
+  EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551));
+}
+
+TEST_F(LibYUVBaseTest, TestFourCC) {
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I411, FOURCC_BPP_I411));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RAW,  FOURCC_BPP_RAW));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ANY,  FOURCC_BPP_ANY));
+}
+
+}  // namespace libyuv
diff --git a/files/util/Makefile b/files/util/Makefile
new file mode 100644
index 00000000..6044d2ad
--- /dev/null
+++ b/files/util/Makefile
@@ -0,0 +1,6 @@
+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+	$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+else
+	$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
+endif
diff --git a/files/util/android/test_runner.py b/files/util/android/test_runner.py
new file mode 100755
index 00000000..8b06b7ea
--- /dev/null
+++ b/files/util/android/test_runner.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Runs tests on Android devices.
+
+This script exists to avoid Libyuv being broken by changes in the Chrome Android
+test execution toolchain. It also conveniently sets the CHECKOUT_SOURCE_ROOT
+environment variable.
+"""
+
+import os
+import sys
+
+SCRIPT_DIR = os.path.dirname(__file__)
+ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))
+CHROMIUM_BUILD_ANDROID_DIR = os.path.join(ROOT_DIR, 'build', 'android')
+sys.path.insert(0, CHROMIUM_BUILD_ANDROID_DIR)
+
+
+import test_runner  # pylint: disable=W0406
+
+def main():
+  # Override environment variable to make it possible for the scripts to find
+  # the root directory (our symlinking of the Chromium build toolchain would
+  # otherwise make them fail to do so).
+  os.environ['CHECKOUT_SOURCE_ROOT'] = ROOT_DIR
+  return test_runner.main()
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/files/util/compare.cc b/files/util/compare.cc
index f030c799..c36c0fa5 100644
--- a/files/util/compare.cc
+++ b/files/util/compare.cc
@@ -4,7 +4,7 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
@@ -38,10 +38,10 @@ int main(int argc, char** argv) {
   int amt1 = 0;
   int amt2 = 0;
   do {
-    amt1 = fread(buf1, 1, kBlockSize, fin1);
+    amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
     if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
     if (fin2) {
-      amt2 = fread(buf2, 1, kBlockSize, fin2);
+      amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
       if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
       int amt_min = (amt1 < amt2) ? amt1 : amt2;
       size_min += amt_min;
@@ -61,4 +61,3 @@ int main(int argc, char** argv) {
   }
   fclose(fin1);
 }
-
diff --git a/files/util/convert.cc b/files/util/convert.cc
new file mode 100644
index 00000000..5f071416
--- /dev/null
+++ b/files/util/convert.cc
@@ -0,0 +1,365 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Convert an ARGB image to YUV.
+// Usage: convert src_argb.raw dst_yuv.raw
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/scale_argb.h"
+
+// options
+bool verbose = false;
+bool attenuate = false;
+bool unattenuate = false;
+int image_width = 0, image_height = 0;  // original width and height
+int dst_width = 0, dst_height = 0;  // new width and height
+int fileindex_org = 0;  // argv argument contains the original file name.
+int fileindex_rec = 0;  // argv argument contains the reconstructed file name.
+int num_rec = 0;  // Number of reconstructed images.
+int num_skip_org = 0;  // Number of frames to skip in original.
+int num_frames = 0;  // Number of frames to convert.
+int filter = 1;  // Bilinear filter for scaling.
+
+static __inline uint32 Abs(int32 v) {
+  return v >= 0 ? v : -v;
+}
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv
+bool ExtractResolutionFromFilename(const char* name,
+                                   int* width_ptr,
+                                   int* height_ptr) {
+  // Isolate the .width_height. section of the filename by searching for a
+  // dot or underscore followed by a digit.
+  for (int i = 0; name[i]; ++i) {
+    if ((name[i] == '.' || name[i] == '_') &&
+        name[i + 1] >= '0' && name[i + 1] <= '9') {
+      int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr);  // NOLINT
+      if (2 == n) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void PrintHelp(const char * program) {
+  printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
+  printf(" -s <width> <height> .... specify source resolution.  "
+         "Optional if name contains\n"
+         "                          resolution (ie. "
+         "name.1920x800_24Hz_P420.yuv)\n"
+         "                          Negative value mirrors.\n");
+  printf(" -d <width> <height> .... specify destination resolution.\n");
+  printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
+  printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n");
+  printf(" -frames <num> .......... Number of frames to convert\n");
+  printf(" -attenuate ............. Attenuate the ARGB image\n");
+  printf(" -unattenuate ........... Unattenuate the ARGB image\n");
+  printf(" -v ..................... verbose\n");
+  printf(" -h ..................... this help\n");
+  exit(0);
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+  if (argc <= 1) PrintHelp(argv[0]);
+  for (int c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-attenuate")) {
+      attenuate = true;
+    } else if (!strcmp(argv[c], "-unattenuate")) {
+      unattenuate = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+      image_width = atoi(argv[++c]);    // NOLINT
+      image_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-d") && c + 2 < argc) {
+      dst_width = atoi(argv[++c]);    // NOLINT
+      dst_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
+      num_skip_org = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+      num_frames = atoi(argv[++c]);     // NOLINT
+    } else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
+      filter = atoi(argv[++c]);     // NOLINT
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);
+    } else if (fileindex_org == 0) {
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+
+  int org_width, org_height;
+  int rec_width, rec_height;
+  bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                     &org_width,
+                                                     &org_height);
+  bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                     &rec_width,
+                                                     &rec_height);
+  if (image_width == 0 || image_height == 0) {
+    if (org_res_avail) {
+      image_width = org_width;
+      image_height = org_height;
+    } else if (rec_res_avail) {
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+  if (dst_width == 0 || dst_height == 0) {
+    if (rec_res_avail) {
+      dst_width = rec_width;
+      dst_height = rec_height;
+    } else {
+      dst_width = Abs(image_width);
+      dst_height = Abs(image_height);
+    }
+  }
+}
+
+static const int kTileX = 32;
+static const int kTileY = 32;
+
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+                         int src_width, int src_height,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int dst_width, int dst_height,
+                         libyuv::FilterMode filtering) {
+  for (int y = 0; y < dst_height; y += kTileY) {
+    for (int x = 0; x < dst_width; x += kTileX) {
+      int clip_width = kTileX;
+      if (x + clip_width > dst_width) {
+        clip_width = dst_width - x;
+      }
+      int clip_height = kTileY;
+      if (y + clip_height > dst_height) {
+        clip_height = dst_height - y;
+      }
+      int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb,
+                                    src_width, src_height,
+                                    dst_argb, dst_stride_argb,
+                                    dst_width, dst_height,
+                                    x, y, clip_width, clip_height, filtering);
+      if (r) {
+        return r;
+      }
+    }
+  }
+  return 0;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);
+
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to convert to
+  FILE** file_rec = new FILE* [num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL;
+  bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL;
+  if (!org_is_yuv && !org_is_argb) {
+    fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+  int org_size = Abs(image_width) * Abs(image_height) * 4;  // ARGB
+  // Input is YUV
+  if (org_is_yuv) {
+    const int y_size = Abs(image_width) * Abs(image_height);
+    const int uv_size = ((Abs(image_width) + 1) / 2) *
+        ((Abs(image_height) + 1) / 2);
+    org_size = y_size + 2 * uv_size;  // YUV original.
+  }
+
+  const int dst_size = dst_width * dst_height * 4;  // ARGB scaled
+  const int y_size = dst_width * dst_height;
+  const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;
+#if defined(_MSC_VER)
+  _fseeki64(file_org,
+            static_cast<__int64>(num_skip_org) *
+            static_cast<__int64>(org_size), SEEK_SET);
+#else
+  fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+
+  uint8* const ch_org = new uint8[org_size];
+  uint8* const ch_dst = new uint8[dst_size];
+  uint8* const ch_rec = new uint8[total_size];
+  if (ch_org == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_dst;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d to %dx%d\n", image_width, image_height,
+           dst_width, dst_height);
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0; ; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames)
+      break;
+
+    // Load original YUV or ARGB frame.
+    size_t bytes_org = fread(ch_org, sizeof(uint8),
+                             static_cast<size_t>(org_size), file_org);
+    if (bytes_org < static_cast<size_t>(org_size))
+      break;
+
+    // TODO(fbarchard): Attenuate doesnt need to know dimensions.
+    // ARGB attenuate frame
+    if (org_is_argb && attenuate) {
+      libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+    // ARGB unattenuate frame
+    if (org_is_argb && unattenuate) {
+      libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      // Scale YUV or ARGB frame.
+      if (org_is_yuv) {
+        int src_width = Abs(image_width);
+        int src_height = Abs(image_height);
+        int half_src_width = (src_width + 1) / 2;
+        int half_src_height = (src_height + 1) / 2;
+        int half_dst_width = (dst_width + 1) / 2;
+        int half_dst_height = (dst_height + 1) / 2;
+        I420Scale(ch_org, src_width,
+                  ch_org + src_width * src_height, half_src_width,
+                  ch_org + src_width * src_height +
+                      half_src_width * half_src_height,  half_src_width,
+                  image_width, image_height,
+                  ch_rec, dst_width,
+                  ch_rec + dst_width * dst_height, half_dst_width,
+                  ch_rec + dst_width * dst_height +
+                      half_dst_width * half_dst_height,  half_dst_width,
+                  dst_width, dst_height,
+                      static_cast<libyuv::FilterMode>(filter));
+      } else {
+        TileARGBScale(ch_org, Abs(image_width) * 4,
+                      image_width, image_height,
+                      ch_dst, dst_width * 4,
+                      dst_width, dst_height,
+                      static_cast<libyuv::FilterMode>(filter));
+      }
+      bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
+      bool rec_is_argb =
+          strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL;
+      if (!rec_is_yuv && !rec_is_argb) {
+        fprintf(stderr, "Output format unknown %s\n",
+                argv[fileindex_rec + cur_rec]);
+        continue;  // Advance to next file.
+      }
+
+      // Convert ARGB to YUV.
+      if (!org_is_yuv && rec_is_yuv) {
+        int half_width = (dst_width + 1) / 2;
+        int half_height = (dst_height + 1) / 2;
+        libyuv::ARGBToI420(ch_dst, dst_width * 4,
+                           ch_rec, dst_width,
+                           ch_rec + dst_width * dst_height, half_width,
+                           ch_rec + dst_width * dst_height +
+                               half_width * half_height,  half_width,
+                           dst_width, dst_height);
+      }
+
+      // Output YUV or ARGB frame.
+      if (rec_is_yuv) {
+        size_t bytes_rec = fwrite(ch_rec, sizeof(uint8),
+                                  static_cast<size_t>(total_size),
+                                  file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(total_size))
+          break;
+      } else {
+        size_t bytes_rec = fwrite(ch_dst, sizeof(uint8),
+                                  static_cast<size_t>(dst_size),
+                                  file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(dst_size))
+          break;
+      }
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (verbose) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+        printf("\n");
+      }
+    }
+  }
+
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] ch_org;
+  delete[] ch_dst;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
diff --git a/files/util/cpuid.c b/files/util/cpuid.c
new file mode 100644
index 00000000..94e245b1
--- /dev/null
+++ b/files/util/cpuid.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define INCLUDE_LIBYUV_COMPARE_H_
+#include "libyuv.h"
+#include "./psnr.h"
+#include "./ssim.h"
+
+int main(int argc, const char* argv[]) {
+  int cpu_flags = TestCpuFlag(-1);
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+  if (has_x86) {
+    uint32 family, model, cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS  SiS processor
+    // UMC UMC UMC  UMC processor
+    CpuId(0, 0, &cpu_info[0]);
+    cpu_info[0] = cpu_info[1];  // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;
+    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, &cpu_info[0]);
+    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+  }
+#endif
+  printf("Cpu Flags %x\n", cpu_flags);
+  printf("Has ARM %x\n", has_arm);
+  printf("Has MIPS %x\n", has_mips);
+  printf("Has X86 %x\n", has_x86);
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    printf("Has NEON %x\n", has_neon);
+  }
+  if (has_mips) {
+    int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
+    printf("Has DSPR2 %x\n", has_dspr2);
+  }
+  if (has_x86) {
+    int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    int has_avx = TestCpuFlag(kCpuHasAVX);
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    int has_avx3 = TestCpuFlag(kCpuHasAVX3);
+    int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+    printf("Has SSE2 %x\n", has_sse2);
+    printf("Has SSSE3 %x\n", has_ssse3);
+    printf("Has SSE4.1 %x\n", has_sse41);
+    printf("Has SSE4.2 %x\n", has_sse42);
+    printf("Has AVX %x\n", has_avx);
+    printf("Has AVX2 %x\n", has_avx2);
+    printf("Has AVX3 %x\n", has_avx3);
+    printf("Has ERMS %x\n", has_erms);
+    printf("Has FMA3 %x\n", has_fma3);
+  }
+  return 0;
+}
+
diff --git a/files/util/psnr.cc b/files/util/psnr.cc
new file mode 100644
index 00000000..52b04bd5
--- /dev/null
+++ b/files/util/psnr.cc
@@ -0,0 +1,288 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./psnr.h"  // NOLINT
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>  // For __cpuid()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32;  // NOLINT
+#ifdef _MSC_VER
+typedef unsigned __int64 uint64;
+#else  // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+#endif  // __LP64__
+#endif  // _MSC_VER
+
+// libyuv provides this function when linking library for jpeg support.
+#if !defined(HAVE_JPEG)
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32 SumSquareError_NEON(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q7, #0                         \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"
+    "vld1.u8    {q1}, [%1]!                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q7, d4, d4                     \n"
+    "vmlal.s16  q8, d6, d6                     \n"
+    "vmlal.s16  q8, d5, d5                     \n"
+    "vmlal.s16  q10, d7, d7                    \n"
+    "subs       %2, %2, #16                    \n"
+    "bhi        1b                             \n"
+
+    "vadd.u32   q7, q7, q8                     \n"
+    "vadd.u32   q9, q9, q10                    \n"
+    "vadd.u32   q10, q7, q9                    \n"
+    "vpaddl.u32 q1, q10                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32 SumSquareError_NEON(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+  "1:                                          \n"
+    "ld1        {v0.16b}, [%0], #16            \n"
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked)
+static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
+                                  const uint8* /*src_b*/, int /*count*/) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+  wloop:
+    movdqu     xmm1, [eax]
+    movdqu     xmm2, [eax + edx]
+    lea        eax,  [eax + 16]
+    movdqu     xmm3, xmm1
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqu     xmm2, xmm1
+    punpcklbw  xmm1, xmm5
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16
+    ja         wloop
+
+    pshufd     xmm1, xmm0, 0EEh
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 01h
+    paddd      xmm0, xmm1
+    movd       eax, xmm0
+    ret
+  }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32 SumSquareError_SSE2(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  uint32 sse;
+  asm volatile (  // NOLINT
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "sub       %0,%1                           \n"
+
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm1                     \n"
+    "movdqu    (%0,%1,1),%%xmm2                \n"
+    "lea       0x10(%0),%0                     \n"
+    "movdqu    %%xmm1,%%xmm3                   \n"
+    "psubusb   %%xmm2,%%xmm1                   \n"
+    "psubusb   %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "ja        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );  // NOLINT
+  return sse;
+}
+#endif  // LIBYUV_DISABLE_X86 etc
+
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+// For gcc/clang but not clangcl.
+#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "cpuid                                     \n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#endif
+
+static int CpuHasSSE2() {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
+  if (cpu_info[3] & 0x04000000) {
+    return 1;
+  }
+#endif
+  return 0;
+}
+#endif  // HAS_SUMSQUAREERROR_SSE2
+
+static uint32 SumSquareError_C(const uint8* src_a,
+                               const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  for (int x = 0; x < count; ++x) {
+    int diff = src_a[x] - src_b[x];
+    sse += static_cast<uint32>(diff * diff);
+  }
+  return sse;
+}
+
+double ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count) {
+  uint32 (*SumSquareError)(const uint8* src_a,
+                           const uint8* src_b, int count) = SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  SumSquareError = SumSquareError_NEON;
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (CpuHasSSE2()) {
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+  const int kBlockSize = 1 << 15;
+  uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  int remainder = count & (kBlockSize - 1) & ~15;
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 15;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return static_cast<double>(sse);
+}
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size) {
+  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
+  if (sse <= kMINSSE)
+    sse = kMINSSE;  // Produces max PSNR of 128
+  return 10.0 * log10(255.0 * 255.0 * size / sse);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/files/util/psnr.h b/files/util/psnr.h
new file mode 100644
index 00000000..0816b976
--- /dev/null
+++ b/files/util/psnr.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_PSNR_H_  // NOLINT
+#define UTIL_PSNR_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+static const double kMaxPSNR = 128.0;
+
+// libyuv provides this function when linking library for jpeg support.
+// TODO(fbarchard): make psnr lib compatible subset of libyuv.
+#if !defined(HAVE_JPEG)
+// Computer Sum of Squared Error (SSE).
+// Pass this to ComputePSNR for final result.
+double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_PSNR_H_  // NOLINT
diff --git a/files/util/psnr_main.cc b/files/util/psnr_main.cc
new file mode 100644
index 00000000..0518ab84
--- /dev/null
+++ b/files/util/psnr_main.cc
@@ -0,0 +1,648 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc
+// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc
+//
+// To enable OpenMP and SSE2
+// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc
+// vc:  cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+//
+// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec]
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "./psnr.h"
+#include "./ssim.h"
+#ifdef HAVE_JPEG
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#endif
+
+struct metric {
+  double y, u, v, all;
+  double min_y, min_u, min_v, min_all;
+  double global_y, global_u, global_v, global_all;
+  int min_frame;
+};
+
+// options
+bool verbose = false;
+bool quiet = false;
+bool show_name = false;
+bool do_swap_uv = false;
+bool do_psnr = false;
+bool do_ssim = false;
+bool do_mse = false;
+bool do_lssim = false;
+int image_width = 0, image_height = 0;
+int fileindex_org = 0;  // argv argument contains the source file name.
+int fileindex_rec = 0;  // argv argument contains the destination file name.
+int num_rec = 0;
+int num_skip_org = 0;
+int num_skip_rec = 0;
+int num_frames = 0;
+#ifdef _OPENMP
+int num_threads = 0;
+#endif
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv
+bool ExtractResolutionFromFilename(const char* name,
+                                   int* width_ptr,
+                                   int* height_ptr) {
+  // Isolate the .width_height. section of the filename by searching for a
+  // dot or underscore followed by a digit.
+  for (int i = 0; name[i]; ++i) {
+    if ((name[i] == '.' || name[i] == '_') &&
+        name[i + 1] >= '0' && name[i + 1] <= '9') {
+      int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr);  // NOLINT
+      if (2 == n) {
+        return true;
+      }
+    }
+  }
+
+#ifdef HAVE_JPEG
+  // Try parsing file as a jpeg.
+  FILE* const file_org = fopen(name, "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", name);
+    return false;
+  }
+  fseek(file_org, 0, SEEK_END);
+  size_t total_size  = ftell(file_org);
+  fseek(file_org, 0, SEEK_SET);
+  uint8* const ch_org = new uint8[total_size];
+  memset(ch_org, 0, total_size);
+  size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+  fclose(file_org);
+  if (bytes_org == total_size) {
+    if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
+      delete[] ch_org;
+      return true;
+    }
+  }
+  delete[] ch_org;
+#endif  // HAVE_JPEG
+  return false;
+}
+
+// Scale Y channel from 16..240 to 0..255.
+// This can be useful when comparing codecs that are inconsistant about Y
+uint8 ScaleY(uint8 y) {
+  int ny = (y - 16) * 256 / 224;
+  if (ny < 0) ny = 0;
+  if (ny > 255) ny = 255;
+  return static_cast<uint8>(ny);
+}
+
+// MSE = Mean Square Error
+double GetMSE(double sse, double size) {
+  return sse / size;
+}
+
+void PrintHelp(const char * program) {
+  printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
+#ifdef HAVE_JPEG
+  printf("jpeg or raw YUV 420 supported.\n");
+#endif
+  printf("options:\n");
+  printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
+         "sequences have the\n");
+  printf("                          resolution embedded in their filename (ie. "
+         "name.1920x800_24Hz_P420.yuv)\n");
+  printf(" -psnr .................. compute PSNR (default)\n");
+  printf(" -ssim .................. compute SSIM\n");
+  printf(" -mse ................... compute MSE\n");
+  printf(" -swap .................. Swap U and V plane\n");
+  printf(" -skip <org> <rec> ...... Number of frame to skip of org and rec\n");
+  printf(" -frames <num> .......... Number of frames to compare\n");
+#ifdef _OPENMP
+  printf(" -t <num> ............... Number of threads\n");
+#endif
+  printf(" -n ..................... Show file name\n");
+  printf(" -v ..................... verbose++\n");
+  printf(" -q ..................... quiet\n");
+  printf(" -h ..................... this help\n");
+  exit(0);
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+  if (argc <= 1) PrintHelp(argv[0]);
+  for (int c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-q")) {
+      quiet = true;
+    } else if (!strcmp(argv[c], "-n")) {
+      show_name = true;
+    } else if (!strcmp(argv[c], "-psnr")) {
+      do_psnr = true;
+    } else if (!strcmp(argv[c], "-mse")) {
+      do_mse = true;
+    } else if (!strcmp(argv[c], "-ssim")) {
+      do_ssim = true;
+    } else if (!strcmp(argv[c], "-lssim")) {
+      do_ssim = true;
+      do_lssim = true;
+    } else if (!strcmp(argv[c], "-swap")) {
+      do_swap_uv = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+      image_width = atoi(argv[++c]);    // NOLINT
+      image_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {
+      num_skip_org = atoi(argv[++c]);   // NOLINT
+      num_skip_rec = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+      num_frames = atoi(argv[++c]);     // NOLINT
+#ifdef _OPENMP
+    } else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
+      num_threads = atoi(argv[++c]);    // NOLINT
+#endif
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);
+    } else if (fileindex_org == 0) {
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0 || num_skip_rec < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (image_width == 0 || image_height == 0) {
+    int org_width, org_height;
+    int rec_width, rec_height;
+    bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                       &org_width,
+                                                       &org_height);
+    bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                       &rec_width,
+                                                       &rec_height);
+    if (org_res_avail) {
+      if (rec_res_avail) {
+        if ((org_width == rec_width) && (org_height == rec_height)) {
+          image_width = org_width;
+          image_height = org_height;
+        } else {
+          fprintf(stderr, "Sequences have different resolutions.\n");
+          PrintHelp(argv[0]);
+        }
+      } else {
+        image_width = org_width;
+        image_height = org_height;
+      }
+    } else if (rec_res_avail) {
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+}
+
+bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
+                   const int y_size, const int uv_size, const size_t total_size,
+                   int number_of_frames,
+                   metric* cur_distortion_psnr,
+                   metric* distorted_frame, bool do_psnr) {
+  const int uv_offset = (do_swap_uv ? uv_size : 0);
+  const uint8* const u_org = ch_org + y_size + uv_offset;
+  const uint8* const u_rec = ch_rec + y_size;
+  const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
+  const uint8* const v_rec = ch_rec + y_size + uv_size;
+  if (do_psnr) {
+#ifdef HAVE_JPEG
+    double y_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+    double u_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+    double v_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+#else
+    double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
+    double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
+    double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+#endif
+    const double total_err = y_err + u_err + v_err;
+    cur_distortion_psnr->global_y += y_err;
+    cur_distortion_psnr->global_u += u_err;
+    cur_distortion_psnr->global_v += v_err;
+    cur_distortion_psnr->global_all += total_err;
+    distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
+    distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
+    distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
+    distorted_frame->all = ComputePSNR(total_err,
+                                       static_cast<double>(total_size));
+  } else {
+    distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
+    distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
+    distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
+    distorted_frame->all =
+      (distorted_frame->y + distorted_frame->u + distorted_frame->v)
+        / total_size;
+    distorted_frame->y /= y_size;
+    distorted_frame->u /= uv_size;
+    distorted_frame->v /= uv_size;
+
+    if (do_lssim) {
+      distorted_frame->all = CalcLSSIM(distorted_frame->all);
+      distorted_frame->y = CalcLSSIM(distorted_frame->y);
+      distorted_frame->u = CalcLSSIM(distorted_frame->u);
+      distorted_frame->v = CalcLSSIM(distorted_frame->v);
+    }
+  }
+
+  cur_distortion_psnr->y += distorted_frame->y;
+  cur_distortion_psnr->u += distorted_frame->u;
+  cur_distortion_psnr->v += distorted_frame->v;
+  cur_distortion_psnr->all += distorted_frame->all;
+
+  bool ismin = false;
+  if (distorted_frame->y < cur_distortion_psnr->min_y)
+    cur_distortion_psnr->min_y = distorted_frame->y;
+  if (distorted_frame->u < cur_distortion_psnr->min_u)
+    cur_distortion_psnr->min_u = distorted_frame->u;
+  if (distorted_frame->v < cur_distortion_psnr->min_v)
+    cur_distortion_psnr->min_v = distorted_frame->v;
+  if (distorted_frame->all < cur_distortion_psnr->min_all) {
+    cur_distortion_psnr->min_all = distorted_frame->all;
+    cur_distortion_psnr->min_frame = number_of_frames;
+    ismin = true;
+  }
+  return ismin;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);
+  if (!do_psnr && !do_ssim) {
+    do_psnr = true;
+  }
+
+#ifdef _OPENMP
+  if (num_threads) {
+    omp_set_num_threads(num_threads);
+  }
+  if (verbose) {
+    printf("OpenMP %d procs\n", omp_get_num_procs());
+  }
+#endif
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to compare to
+  FILE** file_rec = new FILE* [num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  const int y_size = image_width * image_height;
+  const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;    // NOLINT
+#if defined(_MSC_VER)
+  _fseeki64(file_org,
+            static_cast<__int64>(num_skip_org) *
+            static_cast<__int64>(total_size), SEEK_SET);
+#else
+  fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+#if defined(_MSC_VER)
+    _fseeki64(file_rec[cur_rec],
+              static_cast<__int64>(num_skip_rec) *
+              static_cast<__int64>(total_size),
+              SEEK_SET);
+#else
+    fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
+#endif
+  }
+
+  uint8* const ch_org = new uint8[total_size];
+  uint8* const ch_rec = new uint8[total_size];
+  if (ch_org == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  metric* const distortion_psnr = new metric[num_rec];
+  metric* const distortion_ssim = new metric[num_rec];
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    cur_distortion_psnr->y = 0.0;
+    cur_distortion_psnr->u = 0.0;
+    cur_distortion_psnr->v = 0.0;
+    cur_distortion_psnr->all = 0.0;
+    cur_distortion_psnr->min_y = kMaxPSNR;
+    cur_distortion_psnr->min_u = kMaxPSNR;
+    cur_distortion_psnr->min_v = kMaxPSNR;
+    cur_distortion_psnr->min_all = kMaxPSNR;
+    cur_distortion_psnr->min_frame = 0;
+    cur_distortion_psnr->global_y = 0.0;
+    cur_distortion_psnr->global_u = 0.0;
+    cur_distortion_psnr->global_v = 0.0;
+    cur_distortion_psnr->global_all = 0.0;
+    distortion_ssim[cur_rec] = cur_distortion_psnr[cur_rec];
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d\n", image_width, image_height);
+  }
+
+  if (!quiet) {
+    printf("Frame");
+    if (do_psnr) {
+      printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame");
+    }
+    if (do_ssim) {
+      printf("\t  SSIM-Y\t  SSIM-U\t  SSIM-V\t  SSIM-All\t Frame");
+    }
+    if (show_name) {
+      printf("\tName\n");
+    } else {
+      printf("\n");
+    }
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0; ; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames)
+      break;
+
+    size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+    if (bytes_org < total_size) {
+#ifdef HAVE_JPEG
+      // Try parsing file as a jpeg.
+      uint8* const ch_jpeg = new uint8[bytes_org];
+      memcpy(ch_jpeg, ch_org, bytes_org);
+      memset(ch_org, 0, total_size);
+
+      if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
+                                  ch_org,
+                                  image_width,
+                                  ch_org + y_size,
+                                  (image_width + 1) / 2,
+                                  ch_org + y_size + uv_size,
+                                  (image_width + 1) / 2,
+                                  image_width,
+                                  image_height,
+                                  image_width,
+                                  image_height)) {
+        delete[] ch_jpeg;
+        break;
+      }
+      delete[] ch_jpeg;
+#else
+      break;
+#endif  // HAVE_JPEG
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      size_t bytes_rec = fread(ch_rec, sizeof(uint8),
+                               total_size, file_rec[cur_rec]);
+      if (bytes_rec < total_size) {
+#ifdef HAVE_JPEG
+        // Try parsing file as a jpeg.
+        uint8* const ch_jpeg = new uint8[bytes_rec];
+        memcpy(ch_jpeg, ch_rec, bytes_rec);
+        memset(ch_rec, 0, total_size);
+
+        if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
+                                    ch_rec,
+                                    image_width,
+                                    ch_rec + y_size,
+                                    (image_width + 1) / 2,
+                                    ch_rec + y_size + uv_size,
+                                    (image_width + 1) / 2,
+                                    image_width,
+                                    image_height,
+                                    image_width,
+                                    image_height)) {
+          delete[] ch_jpeg;
+          break;
+        }
+        delete[] ch_jpeg;
+#else
+        break;
+#endif  // HAVE_JPEG
+      }
+
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (do_psnr) {
+        metric distorted_frame;
+        metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec,
+                                   y_size, uv_size, total_size,
+                                   number_of_frames,
+                                   cur_distortion_psnr,
+                                   &distorted_frame, true);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (do_ssim) {
+        metric distorted_frame;
+        metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec,
+                                   y_size, uv_size, total_size,
+                                   number_of_frames,
+                                   cur_distortion_ssim,
+                                   &distorted_frame, false);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (verbose) {
+        if (show_name) {
+          printf("\t%s", argv[fileindex_rec + cur_rec]);
+        }
+        printf("\n");
+      }
+    }
+  }
+
+  // Final PSNR computation.
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+    if (number_of_frames > 0) {
+      const double norm = 1. / static_cast<double>(number_of_frames);
+      cur_distortion_psnr->y *= norm;
+      cur_distortion_psnr->u *= norm;
+      cur_distortion_psnr->v *= norm;
+      cur_distortion_psnr->all *= norm;
+      cur_distortion_ssim->y *= norm;
+      cur_distortion_ssim->u *= norm;
+      cur_distortion_ssim->v *= norm;
+      cur_distortion_ssim->all *= norm;
+    }
+
+    if (do_psnr) {
+      const double global_psnr_y = ComputePSNR(
+          cur_distortion_psnr->global_y,
+          static_cast<double>(y_size) * number_of_frames);
+      const double global_psnr_u = ComputePSNR(
+          cur_distortion_psnr->global_u,
+          static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_v = ComputePSNR(
+          cur_distortion_psnr->global_v,
+          static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_all = ComputePSNR(
+          cur_distortion_psnr->global_all,
+          static_cast<double>(total_size) * number_of_frames);
+      printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+          global_psnr_y,
+          global_psnr_u,
+          global_psnr_v,
+          global_psnr_all,
+          number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (!quiet) {
+      printf("Avg:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+             cur_distortion_psnr->y,
+             cur_distortion_psnr->u,
+             cur_distortion_psnr->v,
+             cur_distortion_psnr->all,
+             number_of_frames);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+             cur_distortion_ssim->y,
+             cur_distortion_ssim->u,
+             cur_distortion_ssim->v,
+             cur_distortion_ssim->all,
+             number_of_frames);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+    if (!quiet) {
+      printf("Min:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+            cur_distortion_psnr->min_y,
+            cur_distortion_psnr->min_u,
+            cur_distortion_psnr->min_v,
+            cur_distortion_psnr->min_all,
+            cur_distortion_psnr->min_frame);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+            cur_distortion_ssim->min_y,
+            cur_distortion_ssim->min_u,
+            cur_distortion_ssim->min_v,
+            cur_distortion_ssim->min_all,
+            cur_distortion_ssim->min_frame);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (do_mse) {
+      double global_mse_y = GetMSE(cur_distortion_psnr->global_y,
+        static_cast<double>(y_size) * number_of_frames);
+      double global_mse_u = GetMSE(cur_distortion_psnr->global_u,
+        static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_v = GetMSE(cur_distortion_psnr->global_v,
+        static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_all = GetMSE(cur_distortion_psnr->global_all,
+        static_cast<double>(total_size) * number_of_frames);
+      printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+          global_mse_y,
+          global_mse_u,
+          global_mse_v,
+          global_mse_all,
+          number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+  }
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] distortion_psnr;
+  delete[] distortion_ssim;
+  delete[] ch_org;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
diff --git a/files/util/ssim.cc b/files/util/ssim.cc
new file mode 100644
index 00000000..5a6399b7
--- /dev/null
+++ b/files/util/ssim.cc
@@ -0,0 +1,336 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../util/ssim.h"  // NOLINT
+
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32;     // NOLINT
+typedef unsigned short uint16;   // NOLINT
+
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
+  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
+#define __SSE2__
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// SSIM
+enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
+
+// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
+// The maximum value (11 x 11) must be less than 128 to avoid sign
+// problems during the calls to _mm_mullo_epi16().
+static const int K[KERNEL_SIZE] = {
+  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)
+};
+static const double kiW[KERNEL + 1 + 1] = {
+  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
+  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
+  1. / 1056.,   // 1 / sum(i:0..5, j..6) K[i]*K[j]
+  1. / 957.,    // 1 / sum(i:0..4, j..6) K[i]*K[j]
+  1. / 726.,    // 1 / sum(i:0..3, j..6) K[i]*K[j]
+};
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+
+#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
+#define MAKE_WEIGHT(L)                                               \
+  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
+        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
+
+// We need this union trick to be able to initialize constant static __m128i
+// values. We can't call _mm_set_epi16() for static compile-time initialization.
+static const struct {
+  union {
+    uint16 i16_[8];
+    __m128i m_;
+  } values_;
+} W0 = MAKE_WEIGHT(0),
+  W1 = MAKE_WEIGHT(1),
+  W2 = MAKE_WEIGHT(2),
+  W3 = MAKE_WEIGHT(3);
+  // ... the rest is symmetric.
+#undef MAKE_WEIGHT
+#undef PWEIGHT
+#endif
+
+// Common final expression for SSIM, once the weighted sums are known.
+static double FinalizeSSIM(double iw, double xm, double ym,
+                           double xxm, double xym, double yym) {
+  const double iwx = xm * iw;
+  const double iwy = ym * iw;
+  double sxx = xxm * iw - iwx * iwx;
+  double syy = yym * iw - iwy * iwy;
+  // small errors are possible, due to rounding. Clamp to zero.
+  if (sxx < 0.) sxx = 0.;
+  if (syy < 0.) syy = 0.;
+  const double sxsy = sqrt(sxx * syy);
+  const double sxy = xym * iw - iwx * iwy;
+  static const double C11 = (0.01 * 0.01) * (255 * 255);
+  static const double C22 = (0.03 * 0.03) * (255 * 255);
+  static const double C33 = (0.015 * 0.015) * (255 * 255);
+  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+  const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
+  const double s = (sxy + C33) / (sxsy + C33);
+  return l * c * s;
+}
+
+// GetSSIM() does clipping.  GetSSIMFullKernel() does not
+
+// TODO(skal): use summed tables?
+// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
+// with a diff of 255, squared. The maximum error is thus 0x4388241,
+// which fits into 32 bits integers.
+double GetSSIM(const uint8 *org, const uint8 *rec,
+               int xo, int yo, int W, int H, int stride) {
+  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+  org += (yo - KERNEL) * stride;
+  org += (xo - KERNEL);
+  rec += (yo - KERNEL) * stride;
+  rec += (xo - KERNEL);
+  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
+    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
+    const int Wy = K[y_];
+    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
+      const int Wxy = Wy * K[x_];
+      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
+        const int org_x = org[x_];
+        const int rec_x = rec[x_];
+        ws += Wxy;
+        xm  += Wxy * org_x;
+        ym  += Wxy * rec_x;
+        xxm += Wxy * org_x * org_x;
+        xym += Wxy * org_x * rec_x;
+        yym += Wxy * rec_x * rec_x;
+      }
+    }
+  }
+  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
+}
+
+double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
+                         int xo, int yo, int stride,
+                         double area_weight) {
+  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+
+#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
+
+  org += yo * stride + xo;
+  rec += yo * stride + xo;
+  for (int y = 1; y <= KERNEL; y++) {
+    const int dy1 = y * stride;
+    const int dy2 = y * stride;
+    const int Wy = K[KERNEL + y];
+
+    for (int x = 1; x <= KERNEL; x++) {
+      // Compute the contributions of upper-left (ul), upper-right (ur)
+      // lower-left (ll) and lower-right (lr) points (see the diagram below).
+      // Symmetric Kernel will have same weight on those points.
+      //       -  -  -  -  -  -  -
+      //       -  ul -  -  -  ur -
+      //       -  -  -  -  -  -  -
+      //       -  -  -  0  -  -  -
+      //       -  -  -  -  -  -  -
+      //       -  ll -  -  -  lr -
+      //       -  -  -  -  -  -  -
+      const int Wxy = Wy * K[KERNEL + x];
+      const int ul1 = org[-dy1 - x];
+      const int ur1 = org[-dy1 + x];
+      const int ll1 = org[dy1 - x];
+      const int lr1 = org[dy1 + x];
+
+      const int ul2 = rec[-dy2 - x];
+      const int ur2 = rec[-dy2 + x];
+      const int ll2 = rec[dy2 - x];
+      const int lr2 = rec[dy2 + x];
+
+      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
+      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
+      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
+      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
+      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
+    }
+
+    // Compute the contributions of up (u), down (d), left (l) and right (r)
+    // points across the main axes (see the diagram below).
+    // Symmetric Kernel will have same weight on those points.
+    //       -  -  -  -  -  -  -
+    //       -  -  -  u  -  -  -
+    //       -  -  -  -  -  -  -
+    //       -  l  -  0  -  r  -
+    //       -  -  -  -  -  -  -
+    //       -  -  -  d  -  -  -
+    //       -  -  -  -  -  -  -
+    const int Wxy = Wy * K[KERNEL];
+    const int u1 = org[-dy1];
+    const int d1 = org[dy1];
+    const int l1 = org[-y];
+    const int r1 = org[y];
+
+    const int u2 = rec[-dy2];
+    const int d2 = rec[dy2];
+    const int l2 = rec[-y];
+    const int r2 = rec[y];
+
+    xm  += Wxy * (u1 + d1 + l1 + r1);
+    ym  += Wxy * (u2 + d2 + l2 + r2);
+    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
+    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
+    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
+  }
+
+  // Lastly the contribution of (x0, y0) point.
+  const int Wxy = K[KERNEL] * K[KERNEL];
+  const int s1 = org[0];
+  const int s2 = rec[0];
+
+  xm  += Wxy * s1;
+  ym  += Wxy * s2;
+  xxm += Wxy * s1 * s1;
+  xym += Wxy * s1 * s2;
+  yym += Wxy * s2 * s2;
+
+#else   // __SSE2__
+
+  org += (yo - KERNEL) * stride + (xo - KERNEL);
+  rec += (yo - KERNEL) * stride + (xo - KERNEL);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = zero;
+  __m128i y = zero;
+  __m128i xx = zero;
+  __m128i xy = zero;
+  __m128i yy = zero;
+
+// Read 8 pixels at line #L, and convert to 16bit, perform weighting
+// and acccumulate.
+#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
+  const __m128i v0 =                                                         \
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
+  const __m128i v1 =                                                         \
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
+  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
+  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
+  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
+  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
+  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
+  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
+  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
+  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
+  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
+  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
+  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
+} while (0)
+
+#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
+  uint32 tmp[4];                                                             \
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
+  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
+} while (0)
+
+  LOAD_LINE_PAIR(0, W0);
+  LOAD_LINE_PAIR(1, W1);
+  LOAD_LINE_PAIR(2, W2);
+  LOAD_LINE_PAIR(3, W3);
+  LOAD_LINE_PAIR(4, W2);
+  LOAD_LINE_PAIR(5, W1);
+  LOAD_LINE_PAIR(6, W0);
+
+  ADD_AND_STORE_FOUR_EPI32(x, xm);
+  ADD_AND_STORE_FOUR_EPI32(y, ym);
+  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
+  ADD_AND_STORE_FOUR_EPI32(xy, xym);
+  ADD_AND_STORE_FOUR_EPI32(yy, yym);
+
+#undef LOAD_LINE_PAIR
+#undef ADD_AND_STORE_FOUR_EPI32
+#endif
+
+  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
+}
+
+static int start_max(int x, int y) { return (x > y) ? x : y; }
+
+double CalcSSIM(const uint8 *org, const uint8 *rec,
+                const int image_width, const int image_height) {
+  double SSIM = 0.;
+  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
+  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
+  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
+  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
+  const int stride = image_width;
+
+  for (int j = 0; j < KERNEL_Y; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+
+#ifdef _OPENMP
+  #pragma omp parallel for reduction(+: SSIM)
+#endif
+  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
+    for (int i = 0; i < KERNEL_X; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+    for (int i = KERNEL_X; i < start_x; ++i) {
+      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
+    }
+    if (start_x < image_width) {
+      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
+      // copy the 8 rightmost pixels on a cache area, and pad this area with
+      // zeros which won't contribute to the overall SSIM value (but we need
+      // to pass the correct normalizing constant!). By using this cache, we can
+      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
+      // NOTE: we could use similar method for the left-most pixels too.
+      const int kScratchWidth = 8;
+      const int kScratchStride = kScratchWidth + KERNEL + 1;
+      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
+      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
+
+      for (int k = 0; k < KERNEL_SIZE; ++k) {
+        const int offset =
+            (j - KERNEL + k) * stride + image_width - kScratchWidth;
+        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
+        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
+      }
+      for (int k = 0;  k <= KERNEL_X + 1; ++k) {
+        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
+                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
+      }
+    }
+  }
+
+  for (int j = start_y; j < image_height; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+  return SSIM;
+}
+
+double CalcLSSIM(double ssim) {
+  return -10.0 * log10(1.0 - ssim);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
diff --git a/files/util/ssim.h b/files/util/ssim.h
new file mode 100644
index 00000000..430eb71c
--- /dev/null
+++ b/files/util/ssim.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_SSIM_H_  // NOLINT
+#define UTIL_SSIM_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+double CalcSSIM(const uint8* org, const uint8* rec,
+                const int image_width, const int image_height);
+
+double CalcLSSIM(double ssim);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_SSIM_H_  // NOLINT
diff --git a/files/winarm.mk b/files/winarm.mk
new file mode 100644
index 00000000..c4307a43
--- /dev/null
+++ b/files/winarm.mk
@@ -0,0 +1,46 @@
+# This is a generic makefile for libyuv for Windows Arm.
+# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+# nmake /f winarm.mk
+# make -f winarm.mk
+# nmake /f winarm.mk clean
+# consider /arch:ARMv7VE
+CC=cl
+CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP
+AR=lib
+ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE
+RM=cmd /c del
+
+LOCAL_OBJ_FILES = \
+	source/compare.o\
+	source/compare_common.o\
+	source/convert.o\
+	source/convert_argb.o\
+	source/convert_from.o\
+	source/convert_from_argb.o\
+	source/convert_to_argb.o\
+	source/convert_to_i420.o\
+	source/cpu_id.o\
+	source/planar_functions.o\
+	source/rotate.o\
+	source/rotate_any.o\
+	source/rotate_argb.o\
+	source/rotate_common.o\
+	source/row_any.o\
+	source/row_common.o\
+	source/scale.o\
+	source/scale_any.o\
+	source/scale_argb.o\
+	source/scale_common.o\
+	source/video_common.o
+
+.cc.o:
+	$(CC) /c $(CCFLAGS) $*.cc /Fo$@
+
+all: libyuv_arm.lib winarm.mk
+
+libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk
+	$(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES)
+
+clean:
+	$(RM) "source\*.o" libyuv_arm.lib
+
author	Hangyu Kuang <hkuang@google.com>	2016-07-06 14:21:45 -0700
committer	Hangyu Kuang <hkuang@google.com>	2016-07-08 09:51:10 -0700
commit	f047e7ca6983218eed7703c7afd51fed7bd3b5c9 (patch)
tree	2667579566b6270c21ee4b495b4cd119af5ccf5b
parent	bb74e3e19b98261031216de8cadcef34cccd9e4a (diff)
download	libyuv-nougat-mr2-pixel-release.tar.gz