author     Hendrik Dahlkamp <hendrik@google.com>  2013-01-23 18:27:37 -0800
committer  Adam Hampson <ahampson@google.com>  2013-01-28 15:39:41 -0800
commit     33cfdeb7b267ab635413797fffb046b73272f7ec (patch)
tree       8ff16b765a83ba911233a1d7bfa27cce9cee3b7c
parent     a88a10a6ed9f9801852929bac34bdf10510116f4 (diff)
download   libyuv-33cfdeb7b267ab635413797fffb046b73272f7ec.tar.gz
Update libyuv to r397
Change-Id: I70f5a527de52ae8ae80b189873c9a094035dfa2c
Signed-off-by: Hendrik Dahlkamp <hendrik@google.com>
-rw-r--r--  Android.mk  25
-rw-r--r--  README.google  7
-rw-r--r--  files/AUTHORS  4
-rw-r--r--  files/codereview.settings  12
-rw-r--r--  files/include/libyuv.h  17
-rw-r--r--  files/include/libyuv/basic_types.h  71
-rw-r--r--  files/include/libyuv/compare.h  73
-rw-r--r--  files/include/libyuv/convert.h  257
-rw-r--r--  files/include/libyuv/convert_argb.h  228
-rw-r--r--  files/include/libyuv/convert_from.h  165
-rw-r--r--  files/include/libyuv/cpu_id.h  59
-rw-r--r--  files/include/libyuv/format_conversion.h  173
-rw-r--r--  files/include/libyuv/general.h  47
-rw-r--r--  files/include/libyuv/mjpeg_decoder.h  188
-rw-r--r--  files/include/libyuv/planar_functions.h  344
-rw-r--r--  files/include/libyuv/rotate.h  88
-rw-r--r--  files/include/libyuv/rotate_argb.h  33
-rw-r--r--  files/include/libyuv/row.h  731
-rw-r--r--  files/include/libyuv/scale.h  47
-rw-r--r--  files/include/libyuv/scale_argb.h  34
-rw-r--r--  files/include/libyuv/version.h  16
-rw-r--r--  files/include/libyuv/video_common.h  159
-rw-r--r--  files/libyuv.gyp  89
-rwxr-xr-x  files/libyuv_test.gyp  74
-rw-r--r--  files/source/compare.cc  571
-rw-r--r--  files/source/compare_neon.cc  62
-rw-r--r--  files/source/conversion_tables.h  8
-rw-r--r--  files/source/convert.cc  2627
-rw-r--r--  files/source/convert_argb.cc  1300
-rw-r--r--  files/source/convert_from.cc  1425
-rw-r--r--  files/source/cpu_id.cc  206
-rw-r--r--  files/source/format_conversion.cc  561
-rw-r--r--  files/source/general.cc  284
-rw-r--r--  files/source/mjpeg_decoder.cc  583
-rw-r--r--  files/source/planar_functions.cc  2411
-rw-r--r--  files/source/rotate.cc  1350
-rw-r--r--  files/source/rotate_argb.cc  175
-rw-r--r--  files/source/rotate_neon.cc  406
-rw-r--r--  files/source/rotate_neon.s  563
-rw-r--r--  files/source/rotate_priv.h  72
-rw-r--r--  files/source/row.h  167
-rw-r--r--  files/source/row_common.cc  1246
-rw-r--r--  files/source/row_neon.cc  829
-rw-r--r--  files/source/row_posix.cc  4211
-rw-r--r--  files/source/row_table.cc  469
-rw-r--r--  files/source/row_win.cc  4087
-rw-r--r--  files/source/scale.cc  4051
-rw-r--r--  files/source/scale_argb.cc  1035
-rw-r--r--  files/source/scale_neon.cc  534
-rw-r--r--  files/source/video_common.cc  19
-rw-r--r--  files/source/video_common.h  82
-rw-r--r--  files/unit_test/compare_test.cc  450
-rw-r--r--  files/unit_test/cpu_test.cc  100
-rw-r--r--  files/unit_test/planar_test.cc  1005
-rw-r--r--  files/unit_test/rotate_argb_test.cc  195
-rw-r--r--  files/unit_test/rotate_test.cc  1194
-rw-r--r--  files/unit_test/scale_argb_test.cc  255
-rw-r--r--  files/unit_test/scale_test.cc  447
-rw-r--r--  files/unit_test/testdata/arm_v7.txt  12
-rw-r--r--  files/unit_test/testdata/tegra3.txt  23
-rw-r--r--  files/unit_test/unit_test.cc  39
-rw-r--r--  files/unit_test/unit_test.h  63
-rw-r--r--  files/unit_test/version_test.cc  42
-rw-r--r--  files/util/compare.cc  64
64 files changed, 27342 insertions, 8822 deletions
diff --git a/Android.mk b/Android.mk
index 626f7a1f..d1c565b1 100644
--- a/Android.mk
+++ b/Android.mk
@@ -5,19 +5,32 @@ ifeq ($(TARGET_ARCH),arm)
LOCAL_PATH := $(call my-dir)
common_SRC_FILES := \
+ files/source/compare.cc \
files/source/convert.cc \
+ files/source/convert_argb.cc \
+ files/source/convert_from.cc \
+ files/source/cpu_id.cc \
files/source/format_conversion.cc \
files/source/planar_functions.cc \
- files/source/row_posix.cc \
- files/source/video_common.cc \
- files/source/cpu_id.cc \
- files/source/general.cc \
files/source/rotate.cc \
- files/source/row_table.cc \
- files/source/scale.cc
+ files/source/rotate_argb.cc \
+ files/source/row_common.cc \
+ files/source/row_posix.cc \
+ files/source/scale.cc \
+ files/source/scale_argb.cc \
+ files/source/video_common.cc
common_CFLAGS := -Wall -fexceptions
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+ common_CFLAGS += -DLIBYUV_NEON
+ common_SRC_FILES += \
+ files/source/compare_neon.cc.neon \
+ files/source/rotate_neon.cc.neon \
+ files/source/row_neon.cc.neon \
+ files/source/scale_neon.cc.neon
+endif
+
common_C_INCLUDES = $(LOCAL_PATH)/files/include
# For the device
diff --git a/README.google b/README.google
index 79828ab8..c887302d 100644
--- a/README.google
+++ b/README.google
@@ -1,10 +1,11 @@
-URL: http://libyuv.googlecode.com/svn-history/r52/trunk/
-Version: r52
+URL: http://libyuv.googlecode.com/svn-history/r397/trunk/
+Version: r397
License: BSD
License File: LICENSE
Description:
-libyuv is an open-source library for yuv conversion and scaling.
+libyuv is an open-source library for yuv scaling, conversion, comparison
+and rendering.
Specifically libyuv is optimized for SSE2/SSSE3 and Neon and has demonstrated
speed up to 10x to 16x compared to C code.
diff --git a/files/AUTHORS b/files/AUTHORS
new file mode 100644
index 00000000..9686ac13
--- /dev/null
+++ b/files/AUTHORS
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/files/codereview.settings b/files/codereview.settings
new file mode 100644
index 00000000..11270bba
--- /dev/null
+++ b/files/codereview.settings
@@ -0,0 +1,12 @@
+# This file is used by gcl to get repository specific information.
+# The LibYuv code review is via WebRtc's code review
+CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
+#CC_LIST:
+#VIEW_VC:
+#STATUS:
+TRY_ON_UPLOAD: False
+TRYSERVER_HTTP_HOST: webrtc-cb-linux-master.cbf.corp.google.com
+TRYSERVER_HTTP_PORT: 9020
+#TRYSERVER_SVN_URL:
+#GITCL_PREUPLOAD:
+#GITCL_PREDCOMMIT:
diff --git a/files/include/libyuv.h b/files/include/libyuv.h
index 5a30e2d0..06f26aae 100644
--- a/files/include/libyuv.h
+++ b/files/include/libyuv.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,17 +8,22 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef LIBYUV_INCLUDE_LIBYUV_H_
-#define LIBYUV_INCLUDE_LIBYUV_H_
+#ifndef INCLUDE_LIBYUV_H_ // NOLINT
+#define INCLUDE_LIBYUV_H_
#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
-#include "libyuv/general.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
-#endif // LIBYUV_INCLUDE_LIBYUV_H_
+#endif // INCLUDE_LIBYUV_H_ NOLINT
diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h
index 5adc2bfd..9e9f2abc 100644
--- a/files/include/libyuv/basic_types.h
+++ b/files/include/libyuv/basic_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,27 +8,18 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
-#ifndef WIN32
+#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
-typedef __int64 int64;
-#else
-typedef long long int64;
-#endif /* COMPILER_MSVC */
-typedef int int32;
-typedef short int16;
-typedef char int8;
-
-#ifdef COMPILER_MSVC
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
@@ -38,9 +29,20 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
-#else
-typedef unsigned long long uint64;
-typedef long long int64;
+#else // COMPILER_MSVC
+#ifdef __LP64__
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else // __LP64__
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
@@ -48,10 +50,14 @@ typedef long long int64;
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
-#endif /* COMPILER_MSVC */
+#endif // __LP64__
+#endif // COMPILER_MSVC
typedef unsigned int uint32;
-typedef unsigned short uint16;
+typedef int int32;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
typedef unsigned char uint8;
+typedef signed char int8;
#endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64.
@@ -59,10 +65,33 @@ typedef unsigned char uint8;
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
-#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1)))
+#ifndef ALIGNP
#define ALIGNP(p, t) \
- (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
- ((t)-1)) & ~((t)-1))))
+ (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+ ((t) - 1)) & ~((t) - 1))))
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
-#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h
new file mode 100644
index 00000000..5fd924b8
--- /dev/null
+++ b/files/include/libyuv/compare.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
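
Usage sketch: the comparison API added above can be exercised on two same-sized I420 frames as follows. The wrapper name and the tightly packed plane strides (width for Y, (width + 1) / 2 for U and V) are assumptions for illustration, not part of the diff.

#include "libyuv/compare.h"

// Assumes both frames use packed strides; returns PSNR in dB.
double I420FramePsnr(const uint8* a_y, const uint8* a_u, const uint8* a_v,
                     const uint8* b_y, const uint8* b_u, const uint8* b_v,
                     int width, int height) {
  const int half = (width + 1) / 2;  // chroma stride for packed I420
  return libyuv::I420Psnr(a_y, width, a_u, half, a_v, half,
                          b_y, width, b_u, half, b_v, half,
                          width, height);
}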
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
index fa3b6446..1d4b6a5b 100644
--- a/files/include/libyuv/convert.h
+++ b/files/include/libyuv/convert.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,90 +8,243 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define I420ToI420 I420Copy
+
+// Copy I420 to I420.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+// Convert NV12 to I420. Also used for NV21.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+// Convert Q420 to I420.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert V210 to I420.
+LIBYUV_API
+int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_frame, int dst_stride_frame,
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-} // namespace libyuv
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-#endif // INCLUDE_LIBYUV_CONVERT_H_
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
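
Usage sketch: the ConvertToI420() comment above describes centered cropping via crop_x/crop_y. A minimal caller, assuming packed destination strides and the kRotate0 enumerator from rotate.h; the helper name and the pass-through fourcc parameter are illustrative, not from the diff.

#include "libyuv/convert.h"

int CenterCropToI420(const uint8* sample, size_t sample_size, uint32 fourcc,
                     int src_width, int src_height,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int dst_width, int dst_height) {
  const int crop_x = (src_width - dst_width) / 2;    // center horizontally
  const int crop_y = (src_height - dst_height) / 2;  // center vertically
  return libyuv::ConvertToI420(sample, sample_size,
                               dst_y, dst_width,
                               dst_u, (dst_width + 1) / 2,
                               dst_v, (dst_width + 1) / 2,
                               crop_x, crop_y,
                               src_width, src_height,
                               dst_width, dst_height,
                               libyuv::kRotate0, fourcc);
}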
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
new file mode 100644
index 00000000..86085252
--- /dev/null
+++ b/files/include/libyuv/convert_argb.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// Add missing V210 and Q420.
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 (grey) to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert Q420 to ARGB.
+// LIBYUV_API
+// int Q420ToARGB(const uint8* src_y, int src_stride_y,
+// const uint8* src_yuy2, int src_stride_yuy2,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert V210 to ARGB.
+// LIBYUV_API
+// int V210ToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
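
Usage sketch: expanding a decoded I420 frame to ARGB for rendering with I420ToARGB() declared above. The 4-bytes-per-pixel destination stride and the helper name are assumptions for illustration.

#include "libyuv/convert_argb.h"

void I420FrameToARGB(const uint8* y, const uint8* u, const uint8* v,
                     int width, int height, uint8* argb) {
  const int half = (width + 1) / 2;
  libyuv::I420ToARGB(y, width, u, half, v, half,
                     argb, width * 4,  // ARGB is 4 bytes per pixel
                     width, height);
}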
diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h
new file mode 100644
index 00000000..4eae950c
--- /dev/null
+++ b/files/include/libyuv/convert_from.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// TODO(fbarchard): I420ToNV12
+// TODO(fbarchard): I420ToM420
+// TODO(fbarchard): I420ToQ420
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToV210(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
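
Usage sketch: ConvertFromI420() above accepts 0 for dst_sample_stride to request contiguous rows. A minimal wrapper; the name and packed source strides are illustrative, and the fourcc is left to the caller.

#include "libyuv/convert_from.h"

int PackI420(const uint8* y, const uint8* u, const uint8* v,
             int width, int height,
             uint8* dst_sample, uint32 fourcc) {
  const int half = (width + 1) / 2;
  return libyuv::ConvertFromI420(y, width, u, half, v, half,
                                 dst_sample, 0,  // 0 = contiguous rows
                                 width, height, fourcc);
}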
diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h
index c1000e86..0914f1d2 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/files/include/libyuv/cpu_id.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,28 +8,63 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#define INCLUDE_LIBYUV_CPU_ID_H_
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// These flags are only valid on x86 processors
-static const int kCpuHasSSE2 = 1;
-static const int kCpuHasSSSE3 = 2;
+// Internal flag to indicate cpuid is initialized.
+static const int kCpuInitialized = 0x1;
-// These flags are only valid on ARM processors
-static const int kCpuHasNEON = 4;
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
-// Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 8;
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
// Detect CPU has SSE2 etc.
-bool TestCpuFlag(int flag);
+// Test_flag parameter should be one of kCpuHas constants above.
+// returns non-zero if instruction set is detected
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+ return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+}
// For testing, allow CPU flags to be disabled.
-// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
void MaskCpuFlags(int enable_flags);
+// Low level cpuid for X86. Returns zeros on other CPUs.
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type);
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_CPU_ID_H_
+#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
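
Usage sketch: the inline TestCpuFlag() and MaskCpuFlags() above are typically used like this (function names are illustrative).

#include "libyuv/cpu_id.h"

bool HasSimdFastPath() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasNEON) != 0 ||
         libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
}

void DisableSimdForTesting() {
  libyuv::MaskCpuFlags(0);  // 0 disables all cpu specific optimizations
}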
diff --git a/files/include/libyuv/format_conversion.h b/files/include/libyuv/format_conversion.h
index d3d36f38..06bd387f 100644
--- a/files/include/libyuv/format_conversion.h
+++ b/files/include/libyuv/format_conversion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,34 +8,161 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Convert Bayer RGB formats to I420.
+LIBYUV_API
+int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
+ BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
+
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Convert I420 to Bayer RGB formats.
+LIBYUV_API
+int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Temporary API mapper.
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
+ I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
+
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+// Convert Bayer RGB formats to ARGB.
+LIBYUV_API
+int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Converts ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+// Temporary API mapper.
+#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-// Converts any Bayer RGB format to I420.
-int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Converts any Bayer RGB format to ARGB.
-int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
-
-// Converts ARGB to any Bayer RGB format.
-int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
- uint8* dst_bayer, int dst_stride_bayer,
- uint32 dst_fourcc_bayer,
- int width, int height);
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_
+#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
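
Usage sketch: demosaicing a BGGR capture directly to ARGB with BayerBGGRToARGB() above. Bayer data is one byte per pixel, so the source stride equals the width here; the helper name is illustrative.

#include "libyuv/format_conversion.h"

int DemosaicBGGRToARGB(const uint8* bayer, int width, int height,
                       uint8* argb) {
  return libyuv::BayerBGGRToARGB(bayer, width,     // 1 byte/pixel source
                                 argb, width * 4,  // 4 byte/pixel ARGB
                                 width, height);
}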
diff --git a/files/include/libyuv/general.h b/files/include/libyuv/general.h
deleted file mode 100644
index 58943c86..00000000
--- a/files/include/libyuv/general.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- * General operations on YUV images.
- */
-
-#ifndef INCLUDE_LIBYUV_GENERAL_H_
-#define INCLUDE_LIBYUV_GENERAL_H_
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-// I420 mirror
-int
-I420Mirror(const uint8* src_yplane, int src_ystride,
- const uint8* src_uplane, int src_ustride,
- const uint8* src_vplane, int src_vstride,
- uint8* dst_yplane, int dst_ystride,
- uint8* dst_uplane, int dst_ustride,
- uint8* dst_vplane, int dst_vstride,
- int width, int height);
-
-// Crop/Pad I420 frame to match required dimensions.
-int
-I420CropPad(const uint8* src_frame, int src_width,
- int src_height, uint8* dst_frame,
- int dst_width, int dst_height);
-
-// I420 Crop - crop a rectangle from image
-int
-I420Crop(uint8* frame,
- int src_width, int src_height,
- int dst_width, int dst_height);
-
-} // namespace libyuv
-
-#endif // INCLUDE_LIBYUV_GENERAL_H_
diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 00000000..67090cf0
--- /dev/null
+++ b/files/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv411,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format. Returns true if image looks valid and format is supported.
+ // If return value is true, then the values for all the following getters
+ // are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ bool LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ bool UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+ // dst_width must match exactly. dst_height must be <= to image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+ struct Buffer {
+ const uint8* data;
+ int len;
+ };
+
+ struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+ };
+
+ // Methods that are passed to jpeglib.
+ static int fill_input_buffer(jpeg_decompress_struct* cinfo);
+ static void init_source(jpeg_decompress_struct* cinfo);
+ static void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
+ static void term_source(jpeg_decompress_struct* cinfo);
+
+ static void ErrorHandler(jpeg_common_struct* cinfo);
+
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ bool StartDecode();
+ bool FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ bool DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // true iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ bool has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
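
Usage sketch: decoding a single MJPEG frame into caller-provided component buffers with the MJpegDecoder class above. Per the comments, planes must hold GetNumComponents() pointers to buffers of at least GetComponentSize(i) bytes, and DecodeToBuffers() advances those pointers. The helper name is illustrative.

#include "libyuv/mjpeg_decoder.h"

bool DecodeMjpegFrame(const uint8* src, size_t src_len, uint8** planes) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(src, src_len)) {
    return false;  // not a valid or supported JPEG
  }
  return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                 decoder.GetHeight());
}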
diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h
index 9c0a10a3..7e43dabb 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/files/include/libyuv/planar_functions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,155 +8,331 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// Copy I420 to I420.
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value);
-// Draw a rectangle into I420
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v);
+// Alias.
+#define I400ToI400 CopyPlane
-// Convert I422 to I420. Used by MJPG.
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+// Copy a plane of data (I420 to I400).
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert NV12 to I420. Also used for NV21.
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert NV12 to I420. Deprecated.
-int NV12ToI420(const uint8* src_y,
- const uint8* src_uv, int src_stride,
+// Convert UYVY to I422.
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert Q420 to I420.
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert M420 to I420.
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert YUY2 to I420.
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Aliases.
+#define ARGBToBGRA BGRAToARGB
+#define ARGBToABGR ABGRToARGB
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert UYVY to I420.
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+// ARGB little endian (bgra in memory) to I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert I420 to ARGB.
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
-// Convert I420 to BGRA.
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
-// Convert I420 to ABGR.
-int I420ToABGR(const uint8* src_y, int src_stride_y,
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y, int width, int height,
+ int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Convert I422 to ARGB.
-int I422ToARGB(const uint8* src_y, int src_stride_y,
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Apply a color matrix to each ARGB pixel.
+// matrix_argb is 12 signed coefficients (3 rows of 4 ARGB values),
+// -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Convert I444 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Convert I400 to ARGB.
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
-// Convert I400 to ARGB. Reverse of ARGBToI400
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
-// Convert RAW to ARGB.
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
-// Convert BG24 to ARGB.
-int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Computes a table of cumulative sums for an image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height);
-// Convert ABGR to ARGB. Also used for ARGB to ABGR.
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Blur ARGB image.
+// The caller should allocate a dst_cumsum table of width * height * 16 bytes,
+// aligned to a 16 byte boundary.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius);
-// Convert BGRA to ARGB. Also used for ARGB to BGRA.
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value);
-// Convert ARGB to I400.
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+// Interpolate between two ARGB images using the specified amount of
+// interpolation (0 to 255) and store to destination.
+// 'interpolation' is an 8 bit fraction: 0 means 100% src_argb0 and
+// 255 means almost 100% src_argb1 (255/256ths).
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation);
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+// Row functions for copying pixels from a source with a slope to a row of the
+// destination. Useful for scaling, rotation, mirroring and texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
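As a usage illustration of the planar_functions.h API above (not part of the patch): a minimal crossfade built on ARGBInterpolate. The helper name, the malloc-based buffer, the tightly packed stride of width * 4 and the 0-on-success return convention are assumptions for this sketch; the extra 16 bytes follow the over-write caveat documented above.

#include <stdlib.h>
#include "libyuv/planar_functions.h"

// Return a newly malloc'd 50/50 blend of two same-sized packed ARGB images,
// or NULL on failure. Caller frees. The extra 16 bytes cover the documented
// possible over-write past the end of dst_argb.
uint8* CrossfadeARGB(const uint8* a, const uint8* b, int width, int height) {
  const int stride = width * 4;  // Tightly packed rows (assumption).
  uint8* dst = static_cast<uint8*>(malloc(stride * height + 16));
  if (!dst) return NULL;
  if (libyuv::ARGBInterpolate(a, stride, b, stride, dst, stride,
                              width, height, 128) != 0) {  // 128/256 ~= 50%.
    free(dst);
    return NULL;
  }
  return dst;
}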
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
index 65c38de3..e7608a2d 100644
--- a/files/include/libyuv/rotate.h
+++ b/files/include/libyuv/rotate.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,45 +8,103 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
#define INCLUDE_LIBYUV_ROTATE_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// Supported rotation
+// Supported rotation.
enum RotationMode {
- kRotate0 = 0, // No rotation
- kRotate90 = 90, // Rotate 90 degrees clockwise
- kRotate180 = 180, // Rotate 180 degrees
- kRotate270 = 270, // Rotate 270 degrees clockwise
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
- // Deprecated
+ // Deprecated.
kRotateNone = 0,
kRotateClockwise = 90,
kRotateCounterClockwise = 270,
};
-// Rotate I420 frame
+// Rotate I420 frame.
+LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
- int width, int height,
- RotationMode mode);
+ int src_width, int src_height, RotationMode mode);
-// Rotate NV12 input and store in I420
+// Rotate NV12 input and store in I420.
+LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
- int width, int height,
- RotationMode mode);
+ int src_width, int src_height, RotationMode mode);
+// Rotate planes by 90, 180 or 270 degrees.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+// Rotations for when U and V are interleaved. These functions take one input
+// pointer and split the data into two buffers while rotating them.
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 degree rotations are based on transposes: doing a transpose
+// while reversing the read/write order results in a rotation by +/- 90 degrees.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_ROTATE_H_
+#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
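For reference, a hedged sketch of calling the updated I420Rotate signature with kRotate90. The helper name, tightly packed plane strides and even frame dimensions are assumptions; the dimension swap for 90/270 degree rotations is the only point being illustrated.

#include "libyuv/rotate.h"

// Rotate a tightly packed I420 frame 90 degrees clockwise.
// After a 90 (or 270) degree rotation the destination is
// src_height x src_width, so destination strides derive from src_height.
int RotateI420By90(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   int src_width, int src_height,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_height;  // Dimensions swap.
  return libyuv::I420Rotate(src_y, src_width,
                            src_u, src_width / 2,
                            src_v, src_width / 2,
                            dst_y, dst_width,
                            dst_u, dst_width / 2,
                            dst_v, dst_width / 2,
                            src_width, src_height, libyuv::kRotate90);
}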
diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h
new file mode 100644
index 00000000..a2781df3
--- /dev/null
+++ b/files/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame.
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h
new file mode 100644
index 00000000..4814f254
--- /dev/null
+++ b/files/include/libyuv/row.h
@@ -0,0 +1,731 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Remove kMaxStride
+#define kMaxStride (2880 * 4)
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions.
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTORGBAROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOARGBROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROWUV_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_SETROW_X86
+#define HAS_SPLITUV_SSE2
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBINTERPOLATEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#endif
+
+// The following are Windows only:
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_RGBATOARGBROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_COPYROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORROWUV_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUV_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// TODO(fbarchard): Hook these up to calling functions.
+#define HAS_ABGRTOARGBROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
+#define HAS_BGRATOARGBROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGBATOARGBROW_NEON
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) uint32 uvec32[4];
+#elif defined(__GNUC__)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+#else
+#define SIMD_ALIGNED(var) var
+typedef int8 vec8[16];
+typedef uint8 uvec8[16];
+typedef int16 vec16[8];
+typedef uint16 uvec16[8];
+typedef int32 vec32[4];
+typedef uint32 uvec32[4];
+#endif
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+void I422ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToBGRARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToABGRRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGBARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRAWRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width);
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+
+void SetRow8_X86(uint8* dst, uint32 v32, int count);
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow8_NEON(uint8* dst, uint32 v32, int count);
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow8_C(uint8* dst, uint32 v32, int count);
+void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
+
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_C(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_C(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+void I422ToRGB24Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb24_buf,
+ int width);
+void I422ToRAWRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* raw_buf,
+ int width);
+
+void YToARGBRow_C(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
+
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* argb_buf,
+ int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void I422ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToBGRARow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRAWRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+
+// Used for blur.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction);
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
+
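row.h pairs each kernel with a HAS_* macro and the IS_ALIGNED helper so callers can pick an implementation per row. A hedged sketch of that dispatch pattern, using only the ARGBToYRow declarations above; the wrapper name and the width-multiple-of-16 restriction are assumptions, and the real library additionally consults cpu_id.h at run time.

#include "libyuv/row.h"

// Convert a packed ARGB image to a Y plane one row at a time, using the
// SSSE3 kernel when it was compiled in and pointers/strides are 16-byte
// aligned, otherwise the portable C kernel.
static void ARGBToYPlane(const uint8* src_argb, int src_stride_argb,
                         uint8* dst_y, int dst_stride_y,
                         int width, int height) {
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      libyuv::ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
  // Width is assumed to be a multiple of 16 here, so the _Any/_Unaligned
  // variants are not needed for this sketch.
  if (IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
    ARGBToYRow = libyuv::ARGBToYRow_SSSE3;
  }
#endif
  for (int y = 0; y < height; ++y) {
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
}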
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 8433908b..18098798 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,20 +8,31 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_H_
+#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
// Supported filtering
enum FilterMode {
- kFilterNone = 0, // Point sample; Fastest
+ kFilterNone = 0, // Point sample; Fastest.
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
- kFilterBox = 2 // Highest quality
+ kFilterBox = 2 // Highest quality.
};
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -32,6 +43,7 @@ enum FilterMode {
// quality image, at further expense of speed.
// Returns 0 if successful.
+LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -42,15 +54,8 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
FilterMode filtering);
-// Legacy API
-// If dst_height_offset is non-zero, the image is offset by that many pixels
-// and stretched to (dst_height - dst_height_offset * 2) pixels high,
-// instead of dst_height.
-int Scale(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_height_offset,
- bool interpolate);
-
-// Same, but specified src terms of each plane location and stride.
+// Legacy API. Deprecated.
+LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
@@ -59,9 +64,19 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_width, int dst_height,
bool interpolate);
-// For testing, allow disabling of optimizations.
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
void SetUseReferenceImpl(bool use);
-} // namespace libyuv
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_SCALE_H_
+#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
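A hedged sketch of the preferred (non-deprecated) I420Scale entry point, halving a frame with box filtering. The helper name, tightly packed strides and even dimensions are assumptions.

#include "libyuv/scale.h"

// Downscale a tightly packed I420 frame to half size with box filtering.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              int src_width, int src_height,
              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_width / 2;
  const int dst_height = src_height / 2;
  return libyuv::I420Scale(src_y, src_width,
                           src_u, src_width / 2,
                           src_v, src_width / 2,
                           src_width, src_height,
                           dst_y, dst_width,
                           dst_u, dst_width / 2,
                           dst_v, dst_width / 2,
                           dst_width, dst_height,
                           libyuv::kFilterBox);
}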
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
new file mode 100644
index 00000000..1af0e1dc
--- /dev/null
+++ b/files/include/libyuv/scale_argb.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
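Since ARGB is a single packed plane, the new ARGBScale entry point takes one pointer/stride pair per image. A short sketch; the helper name, the 160x120 output size and the width * 4 strides are illustrative assumptions.

#include "libyuv/scale_argb.h"

// Scale a packed ARGB image down to a 160x120 thumbnail with bilinear
// filtering. Strides are width * 4 bytes for tightly packed rows.
int MakeArgbThumbnail(const uint8* src_argb, int src_width, int src_height,
                      uint8* dst_argb /* at least 160 * 120 * 4 bytes */) {
  return libyuv::ARGBScale(src_argb, src_width * 4,
                           src_width, src_height,
                           dst_argb, 160 * 4,
                           160, 120,
                           libyuv::kFilterBilinear);
}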
diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h
new file mode 100644
index 00000000..e782ae18
--- /dev/null
+++ b/files/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 397
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
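The new version define gives downstream code something to gate on; a trivial compile-time check (the feature list in the message is illustrative):

#include "libyuv/version.h"

#if LIBYUV_VERSION < 397
#error "libyuv r397 or later required (e.g. for ARGBRotate, ARGBScale, MJPGToARGB)."
#endif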
diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h
new file mode 100644
index 00000000..5d812c98
--- /dev/null
+++ b/files/include/libyuv/video_common.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro; otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#define FOURCC(a, b, c, d) ( \
+ (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+enum FourCC {
+ // Canonical fourcc codes used in our code.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_V210 = FOURCC('V', '2', '1', '0'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // bgr565.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // abgr1555.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444.
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+ // Next four are Bayer RGB formats. The four characters define the order of
+ // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // Aliases for canonical fourcc codes, replaced with their canonical
+ // equivalents by CanonicalFourCC().
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+
+ // Match any fourcc.
+ FOURCC_ANY = 0xFFFFFFFF,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_V210 = 22, // 128 / 6 actually.
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ // Next four are Bayer RGB formats. The four characters define the order of
+ // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+
+ // Aliases for canonical fourcc codes, replaced with their canonical
+ // equivalents by CanonicalFourCC().
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
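To show how the FOURCC macro, the alias list and CanonicalFourCC fit together, a hedged sketch; the helper names and the 4:2:0 frame-size arithmetic are assumptions layered on the FOURCC_BPP_* table above.

#include <assert.h>
#include <stddef.h>
#include "libyuv/video_common.h"

// FOURCC_BPP_* values are bits per pixel, so bytes = width * height * bpp / 8.
size_t I420FrameSize(int width, int height) {
  return static_cast<size_t>(width) * height * libyuv::FOURCC_BPP_I420 / 8;
}

void FourCCExample() {
  uint32 fourcc = FOURCC('Y', 'U', 'Y', 'V');  // e.g. reported by a capture driver.
  assert(libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_YUY2);
  assert(I420FrameSize(640, 480) == 640 * 480 * 3 / 2);
}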
diff --git a/files/libyuv.gyp b/files/libyuv.gyp
index d5abab73..18137538 100644
--- a/files/libyuv.gyp
+++ b/files/libyuv.gyp
@@ -1,4 +1,4 @@
-# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
@@ -7,60 +7,85 @@
# be found in the AUTHORS file in the root of the source tree.
{
+ 'variables': {
+ 'use_system_libjpeg%': 0,
+ },
'targets': [
{
'target_name': 'libyuv',
'type': 'static_library',
+ # 'type': 'shared_library',
+ 'conditions': [
+ ['use_system_libjpeg==0', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+ ],
+ }, {
+ 'link_settings': {
+ 'libraries': [
+ '-ljpeg',
+ ],
+ },
+ }],
+ ],
+ 'defines': [
+ 'HAVE_JPEG',
+ # 'LIBYUV_BUILDING_SHARED_LIBRARY',
+ ],
'include_dirs': [
- 'common',
'include',
+ '.',
],
'direct_dependent_settings': {
'include_dirs': [
- 'common',
'include',
+ '.',
],
},
'sources': [
- # includes
- 'include/convert.h',
- 'include/general.h',
- 'include/scale.h',
- 'include/planar_functions.h',
+ # includes.
+ 'include/libyuv.h',
+ 'include/libyuv/basic_types.h',
+ 'include/libyuv/compare.h',
+ 'include/libyuv/convert.h',
+ 'include/libyuv/convert_argb.h',
+ 'include/libyuv/convert_from.h',
+ 'include/libyuv/cpu_id.h',
+ 'include/libyuv/format_conversion.h',
+ 'include/libyuv/mjpeg_decoder.h',
+ 'include/libyuv/planar_functions.h',
+ 'include/libyuv/rotate.h',
+ 'include/libyuv/rotate_argb.h',
+ 'include/libyuv/row.h',
+ 'include/libyuv/scale.h',
+ 'include/libyuv/scale_argb.h',
+ 'include/libyuv/version.h',
+ 'include/libyuv/video_common.h',
- # headers
- 'common/basic_types.h',
- 'common/common.h',
- 'common/constructor_magic.h',
- 'source/cpu_id.h',
- 'source/rotate.h'
- 'source/row.h',
- 'source/video_common.h',
-
- # sources
+ # sources.
+ 'source/compare.cc',
+ 'source/compare_neon.cc',
'source/convert.cc',
+ 'source/convert_argb.cc',
+ 'source/convert_from.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
- 'source/general.cc',
+ 'source/mjpeg_decoder.cc',
'source/planar_functions.cc',
'source/rotate.cc',
- 'source/row_table.cc',
+ 'source/rotate_argb.cc',
+ 'source/rotate_neon.cc',
+ 'source/row_common.cc',
+ 'source/row_neon.cc',
+ 'source/row_posix.cc',
+ 'source/row_win.cc',
'source/scale.cc',
+ 'source/scale_neon.cc',
+ 'source/scale_argb.cc',
'source/video_common.cc',
],
- 'conditions': [
- ['OS=="win"', {
- 'sources': [
- 'source/row_win.cc',
- ],
- },{ # else
- 'sources': [
- 'source/row_posix.cc',
- ],
- }],
- ]
},
- ], # targets
+ ], # targets.
}
# Local Variables:
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
new file mode 100755
index 00000000..27cec8f4
--- /dev/null
+++ b/files/libyuv_test.gyp
@@ -0,0 +1,74 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'targets': [
+ {
+ 'target_name': 'libyuv_unittest',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ # The tests are based on gtest
+ 'testing/gtest.gyp:gtest',
+ 'testing/gtest.gyp:gtest_main',
+ ],
+ 'defines': [
+ 'LIBYUV_SVNREVISION="<!(svnversion -n)"',
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ ],
+ 'sources': [
+ # headers
+ 'unit_test/unit_test.h',
+
+ # sources
+ 'unit_test/compare_test.cc',
+ 'unit_test/cpu_test.cc',
+ 'unit_test/planar_test.cc',
+ 'unit_test/rotate_argb_test.cc',
+ 'unit_test/rotate_test.cc',
+ 'unit_test/scale_argb_test.cc',
+ 'unit_test/scale_test.cc',
+ 'unit_test/unit_test.cc',
+ 'unit_test/version_test.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+
+ {
+ 'target_name': 'compare',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/compare.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+
+ ], # targets
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/files/source/compare.cc b/files/source/compare.cc
new file mode 100644
index 00000000..bf4a7dae
--- /dev/null
+++ b/files/source/compare.cc
@@ -0,0 +1,571 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Hash seed of 5381 is recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ for (int i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
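// Editor's sketch (not part of this patch): the SSE4.1 path below relies on
// the identity hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)) for i = 0..15,
// evaluated modulo 2^32; kHash16x33 and kHashMul0..3 hold those powers of 33.
// A scalar version of the same 16-byte step, with a hypothetical name:
static uint32 HashDjb2_Block16_Sketch(const uint8* src, uint32 hash) {
  uint32 pow33[17];  // pow33[k] = 33^k mod 2^32.
  pow33[0] = 1;
  for (int k = 1; k <= 16; ++k) {
    pow33[k] = pow33[k - 1] * 33u;
  }
  uint32 result = hash * pow33[16];  // Same role as kHash16x33 below.
  for (int i = 0; i < 16; ++i) {
    result += static_cast<uint32>(src[i]) * pow33[15 - i];  // kHashMul0..3.
  }
  return result;  // Equals running HashDjb2_C over these 16 bytes.
}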
+
+// This module is for Visual C x86
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_HASHDJB2_SSE41
+static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static const uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ align 16
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 14 // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 1
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+// GCC 4.2 on OSX has a link error when passing static or const to inline asm.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+#define HAS_HASHDJB2_SSE41
+CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+CONST uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+CONST uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+CONST uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+CONST uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile (
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "sub $0x10,%1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+ return hash;
+}
+#endif // HAS_HASHDJB2_SSE41
+
+// A hash seed of 5381 is recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+
+ const int kBlockSize = 1 << 15; // 32768;
+ while (count >= static_cast<uint64>(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ int remainder = static_cast<int>(count) & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = static_cast<int>(count) & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
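// Editor's sketch (not part of this patch): typical use of the public entry
// point above, hashing a whole frame with the recommended seed of 5381.
// The helper name and buffer arguments here are hypothetical.
static uint32 HashFrame_Sketch(const uint8* frame, int width, int height) {
  return HashDjb2(frame, static_cast<uint64>(width) * height, 5381u);
}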
+
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SUMSQUAREERROR_NEON
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked) __declspec(align(16))
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+ int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ align 16
+ wloop:
+ movdqa xmm1, [eax]
+ movdqa xmm2, [eax + edx]
+ lea eax, [eax + 16]
+ sub ecx, 16
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ jg wloop
+
+ pshufd xmm1, xmm0, 0EEh
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 01h
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 sse;
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movdqa (%0,%1,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+ return sse;
+}
+#endif
+
+static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 sse = 0u;
+ for (int i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += static_cast<uint32>(diff * diff);
+ }
+ return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#elif defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note: only used for multiples of 16, so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+  // 32K values will fit a 32-bit int return value from SumSquareError.
+  // After each block of 32K pixels, accumulate into a 64-bit int.
+ const int kBlockSize = 1 << 15; // 32768;
+ uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ int remainder = count & (kBlockSize - 1) & ~15;
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 15;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
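// Editor's note (not part of this patch): why the 32K block size above is safe
// for a uint32 accumulator -- the worst case per block is 255^2 * 32768 =
// 2130739200, which still fits in 32 bits.
static const uint64 kWorstCaseBlockSse_Sketch =
    static_cast<uint64>(255) * 255 * 32768;  // 2130739200 < 2^32 - 1.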
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#elif defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+ IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+
+ uint64 sse = 0;
+ for (int h = 0; h < height; ++h) {
+ sse += SumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
+ double mse = static_cast<double>(count) / static_cast<double>(sse);
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
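// Editor's sketch (not part of this patch): a worked example of the formula
// above. If every pixel differs by exactly 1, SSE equals the sample count, so
// MSE is 1 and the PSNR reduces to 10 * log10(255 * 255), about 48.13 dB.
static double PsnrForUnitDiff_Sketch() {
  const uint64 kSamples = 16 * 16;
  const uint64 kSse = kSamples;  // Every pixel differs by 1, so diff^2 == 1.
  return SumSquareErrorToPsnr(kSse, kSamples);  // ~48.13
}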
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+ src_b, stride_b,
+ width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+ src_y_b, stride_y_b,
+ width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2)
+static const int64 cc2 = 239708; // (64^2*(.03*255)^2)
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ const int64 count = 64;
+ // Scale the constants by the number of pixels.
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
+
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0)
+ return DBL_MAX;
+ return ssim_n * 1.0 / ssim_d;
+}
+
+// We are using an 8x8 moving window whose starting locations step along a
+// 4x4 pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ int samples = 0;
+ double ssim_total = 0;
+
+ double (*Ssim8x8)(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b);
+
+ Ssim8x8 = Ssim8x8_C;
+
+ // Sample points start at each 4x4 grid location.
+ for (int i = 0; i < height - 8; i += 4) {
+ for (int j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
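// Editor's sketch (not part of this patch): a quick sanity check of the SSIM
// code above. When the two inputs are identical, the numerator and denominator
// in Ssim8x8_C are equal for every window, so CalcFrameSsim returns 1.0.
static double SsimOfIdenticalPlanes_Sketch() {
  const int kWidth = 32;
  const int kHeight = 32;
  uint8 plane[kWidth * kHeight];
  for (int i = 0; i < kWidth * kHeight; ++i) {
    plane[i] = static_cast<uint8>(i & 255);
  }
  return CalcFrameSsim(plane, kWidth, plane, kWidth, kWidth, kHeight);  // 1.0
}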
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+ src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
new file mode 100644
index 00000000..d8b375b8
--- /dev/null
+++ b/files/source/compare_neon.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n"
+ "vld1.u8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h
index 9a328649..ef3ebf36 100644
--- a/files/source/conversion_tables.h
+++ b/files/source/conversion_tables.h
@@ -18,7 +18,10 @@
#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
/******************************************************************************
* YUV TO RGB approximation
@@ -197,7 +200,10 @@ namespace libyuv {
Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251),
Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
-} // namespace libyuv
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
#endif
diff --git a/files/source/convert.cc b/files/source/convert.cc
index 8154dcb7..0882c92b 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,174 +10,131 @@
#include "libyuv/convert.h"
-#include "conversion_tables.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "row.h"
-
-//#define SCALEOPT //Currently for windows only. June 2010
-
-#ifdef SCALEOPT
-#include <emmintrin.h>
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-static inline uint8 Clip(int32 val) {
- if (val < 0) {
- return (uint8) 0;
- } else if (val > 255){
- return (uint8) 255;
- }
- return (uint8) val;
-}
-
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+// Copy I420 with optional flipping
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
- // RGB orientation - bottom up
- // TODO(fbarchard): support inversion
- uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame;
- uint8* out2 = out - dst_stride_frame;
- int h, w;
- int tmp_r, tmp_g, tmp_b;
- const uint8 *y1, *y2 ,*u, *v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = Clip(tmp_b);
- out[1] = Clip(tmp_g);
- out[2] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[3] = Clip(tmp_b);
- out[4] = Clip(tmp_g);
- out[5] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = Clip(tmp_b);
- out2[1] = Clip(tmp_g);
- out2[2] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[3] = Clip(tmp_b);
- out2[4] = Clip(tmp_g);
- out2[5] = Clip(tmp_r);
-
- out += 6;
- out2 += 6;
- y1 += 2;
- y2 += 2;
- u++;
- v++;
- }
- y1 += src_stride_y + src_stride_y - width;
- y2 += src_stride_y + src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= dst_stride_frame * 3;
- out2 -= dst_stride_frame * 3;
- } // end height for
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
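// Editor's sketch (not part of this patch): the negative-height convention
// used throughout this file. Passing -height asks the converter to flip the
// image vertically; the tightly packed strides here are illustrative only.
static int I420VerticalFlip_Sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_y,
                                   uint8* dst_u, uint8* dst_v,
                                   int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return I420Copy(src_y, width, src_u, halfwidth, src_v, halfwidth,
                  dst_y, width, dst_u, halfwidth, dst_v, halfwidth,
                  width, -height);  // Negative height flips vertically.
}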
-// Little Endian...
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
- return -1;
+// Move to row_win etc.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_HALFROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pavgb xmm0, [eax + edx]
+ sub ecx, 16
+ movdqa [eax + edi], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ ret
}
+}
- // RGB orientation - bottom up
- uint8* out = dst_frame + dst_stride_frame * (height - 1);
- uint8* out2 = out - dst_stride_frame;
- int tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2, *u, *v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--) {
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++) {
- // Vertical and horizontal sub-sampling
- // Convert to RGB888 and re-scale to 4 bits
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] =(uint8)((Clip(tmp_g) & 0xf0) + (Clip(tmp_b) >> 4));
- out[1] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out2[1] = (uint8) (0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out2[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- out += 4;
- out2 += 4;
- y1 += 2;
- y2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= (dst_stride_frame + width) * 2;
- out2 -= (dst_stride_frame + width) * 2;
- } // end height for
- return 0;
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_HALFROW_SSE2
+static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(pix) // %2
+ : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0"
+#endif
+);
}
+#endif
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
-
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -188,717 +145,1937 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame;
-
- int tmp_r, tmp_g, tmp_b;
- const uint8* y1,* y2, * u, * v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 565
-
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= 2 * dst_stride_frame + width;
- out2 -= 2 * dst_stride_frame + width;
+ int halfwidth = (width + 1) >> 1;
+ void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) = HalfRow_C;
+#if defined(HAS_HALFROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ HalfRow = HalfRow_SSE2;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ HalfRow(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ HalfRow(src_v, 0, dst_v, halfwidth);
}
return 0;
}
+// Blends 32x2 pixels to 16x1
+// source in scale.cc
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+#endif
+void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
- uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame ;
- int32 tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2, *u, *v;
- int h, w;
-
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
-
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 555
- // 3. Add 1 for alpha value
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= 2 * dst_stride_frame + width;
- out2 -= 2 * dst_stride_frame + width;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C;
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Int_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Int_SSE2;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_v, 0, dst_v, halfwidth);
}
return 0;
}
+// Use bilinear scaling to upsample chroma.
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr);
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
- const uint8* in1 = src_y;
- const uint8* in2 = src_y + src_stride_y;
-
- uint8* out1 = dst_frame;
- uint8* out2 = dst_frame + dst_stride_frame;
-
- // YUY2 - Macro-pixel = 2 image pixels
- // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-#ifndef SCALEOPT
- for (int i = 0; i < ((height + 1) >> 1); i++){
- for (int j = 0; j < ((width + 1) >> 1); j++){
- out1[0] = in1[0];
- out1[1] = *src_u;
- out1[2] = in1[1];
- out1[3] = *src_v;
-
- out2[0] = in2[0];
- out2[1] = *src_u;
- out2[2] = in2[1];
- out2[3] = *src_v;
- out1 += 4;
- out2 += 4;
- src_u++;
- src_v++;
- in1 += 2;
- in2 += 2;
- }
- in1 += 2 * src_stride_y - width;
- in2 += 2 * src_stride_y - width;
- src_u += src_stride_u - ((width + 1) >> 1);
- src_v += src_stride_v - ((width + 1) >> 1);
- out1 += dst_stride_frame + dst_stride_frame - 2 * width;
- out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
-#else
- for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) {
- int32 width__ = (width >> 4);
- _asm
- {
- ;pusha
- mov eax, DWORD PTR [in1] ;1939.33
- mov ecx, DWORD PTR [in2] ;1939.33
- mov ebx, DWORD PTR [src_u] ;1939.33
- mov edx, DWORD PTR [src_v] ;1939.33
- loop0:
- movq xmm6, QWORD PTR [ebx] ;src_u
- movq xmm0, QWORD PTR [edx] ;src_v
- punpcklbw xmm6, xmm0 ;src_u, src_v mix
- ;movdqa xmm1, xmm6
- ;movdqa xmm2, xmm6
- ;movdqa xmm4, xmm6
-
- movdqu xmm3, XMMWORD PTR [eax] ;in1
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm6 ;in1, src_u, in1, src_v
- mov esi, DWORD PTR [out1]
- movdqu XMMWORD PTR [esi], xmm1 ;write to out1
-
- movdqu xmm5, XMMWORD PTR [ecx] ;in2
- movdqa xmm2, xmm5
- punpcklbw xmm2, xmm6 ;in2, src_u, in2, src_v
- mov edi, DWORD PTR [out2]
- movdqu XMMWORD PTR [edi], xmm2 ;write to out2
-
- punpckhbw xmm3, xmm6 ;in1, src_u, in1, src_v again
- movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again
- add esi, 32
- mov DWORD PTR [out1], esi
-
- punpckhbw xmm5, xmm6 ;src_u, in2, src_v again
- movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again
- add edi, 32
- mov DWORD PTR [out2], edi
-
- add ebx, 8
- add edx, 8
- add eax, 16
- add ecx, 16
-
- mov esi, DWORD PTR [width__]
- sub esi, 1
- mov DWORD PTR [width__], esi
- jg loop0
-
- mov DWORD PTR [in1], eax ;1939.33
- mov DWORD PTR [in2], ecx ;1939.33
- mov DWORD PTR [src_u], ebx ;1939.33
- mov DWORD PTR [src_v], edx ;1939.33
-
- ;popa
- emms
- }
- in1 += 2 * src_stride_y - width;
- in2 += 2 * src_stride_y - width;
- out1 += dst_stride_frame + dst_stride_frame - 2 * width;
- out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane.
+ ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
+ halfwidth, halfheight, // to 1/2 width, 1/2 height
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Resample V plane.
+ ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
+ halfwidth, halfheight, // to 1/2 width, 1/2 height
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+ uint8* dst, int dst_stride_frame,
+ int width, int height) {
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#elif defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
+ IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ }
+#endif
+
+ // Copy plane
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+// The UV plane is half width but holds two values per pixel, so
+// src_stride_m420 applies to it as well as to the two Y planes.
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0, int src_stride_y1,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_uv ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUV_C;
+#if defined(HAS_SPLITUV_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+ SplitUV = SplitUV_NEON;
+ }
+#elif defined(HAS_SPLITUV_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ SplitUV = SplitUV_SSE2;
}
#endif
+
+ if (dst_y) {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+
+ int halfheight = (height + 1) >> 1;
+ for (int y = 0; y < halfheight; ++y) {
+ // Copy a row of UV.
+ SplitUV(src_uv, dst_u, dst_v, halfwidth);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
return 0;
}
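// Editor's sketch (not part of this patch): the deinterleave that SplitUV
// performs for NV12/M420 above -- one row of interleaved UVUV... split into
// separate U and V rows. The real SplitUV_C lives in the row_* sources; this
// only illustrates the idea.
static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // U comes first in NV12.
    dst_v[x] = src_uv[2 * x + 1];
  }
}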
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
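// Editor's sketch (not part of this patch): how the stride arguments above map
// onto the M420 layout. Each group of three buffer rows holds two Y rows and
// then one interleaved UV row, which is why the call passes src_stride_m420,
// src_stride_m420 * 2 and src_stride_m420 * 3. Helper names are hypothetical.
static const uint8* M420YRow_Sketch(const uint8* src_m420, int stride, int y) {
  // Y row y sits in group y / 2; within a group, rows 0 and 1 are Y.
  return src_m420 + (y / 2) * stride * 3 + (y & 1) * stride;
}
static const uint8* M420UVRow_Sketch(const uint8* src_m420, int stride,
                                     int uv_row) {
  // The UV row shared by Y rows 2*uv_row and 2*uv_row + 1 is the third row.
  return src_m420 + uv_row * stride * 3 + stride * 2;
}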
+
+// Convert Q420 to I420.
+// Format is rows of YY/YUYV
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_yuy2 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // CopyRow copies the rows of plain Y in Q420 into the Y plane of I420.
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
- int i = 0;
- const uint8* y1 = src_y;
- const uint8* y2 = y1 + src_stride_y;
- const uint8* u = src_u;
- const uint8* v = src_v;
-
- uint8* out1 = dst_frame;
- uint8* out2 = dst_frame + dst_stride_frame;
-
- // Macro-pixel = 2 image pixels
- // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
-
-#ifndef SCALEOPT
- for (; i < ((height + 1) >> 1); i++) {
- for (int j = 0; j < ((width + 1) >> 1); j++) {
- out1[0] = *u;
- out1[1] = y1[0];
- out1[2] = *v;
- out1[3] = y1[1];
-
- out2[0] = *u;
- out2[1] = y2[0];
- out2[2] = *v;
- out2[3] = y2[1];
- out1 += 4;
- out2 += 4;
- u++;
- v++;
- y1 += 2;
- y2 += 2;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out1 += 2 * (dst_stride_frame - width);
- out2 += 2 * (dst_stride_frame - width);
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
}
-#else
- for (; i < (height >> 1);i++) {
- int32 width__ = (width >> 4);
- _asm
- {
- ;pusha
- mov eax, DWORD PTR [in1] ;1939.33
- mov ecx, DWORD PTR [in2] ;1939.33
- mov ebx, DWORD PTR [src_u] ;1939.33
- mov edx, DWORD PTR [src_v] ;1939.33
-loop0:
- movq xmm6, QWORD PTR [ebx] ;src_u
- movq xmm0, QWORD PTR [edx] ;src_v
- punpcklbw xmm6, xmm0 ;src_u, src_v mix
- movdqa xmm1, xmm6
- movdqa xmm2, xmm6
- movdqa xmm4, xmm6
-
- movdqu xmm3, XMMWORD PTR [eax] ;in1
- punpcklbw xmm1, xmm3 ;src_u, in1, src_v
- mov esi, DWORD PTR [out1]
- movdqu XMMWORD PTR [esi], xmm1 ;write to out1
-
- movdqu xmm5, XMMWORD PTR [ecx] ;in2
- punpcklbw xmm2, xmm5 ;src_u, in2, src_v
- mov edi, DWORD PTR [out2]
- movdqu XMMWORD PTR [edi], xmm2 ;write to out2
-
- punpckhbw xmm4, xmm3 ;src_u, in1, src_v again
- movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again
- add esi, 32
- mov DWORD PTR [out1], esi
-
- punpckhbw xmm6, xmm5 ;src_u, in2, src_v again
- movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again
- add edi, 32
- mov DWORD PTR [out2], edi
-
- add ebx, 8
- add edx, 8
- add eax, 16
- add ecx, 16
-
- mov esi, DWORD PTR [width__]
- sub esi, 1
- mov DWORD PTR [width__], esi
- jg loop0
-
- mov DWORD PTR [in1], eax ;1939.33
- mov DWORD PTR [in2], ecx ;1939.33
- mov DWORD PTR [src_u], ebx ;1939.33
- mov DWORD PTR [src_v], edx ;1939.33
-
- ;popa
- emms
- }
- in1 += width;
- in2 += width;
- out1 += 2 * (dst_stride_frame - width);
- out2 += 2 * (dst_stride_frame - width);
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
}
#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ CopyRow(src_y, dst_y, width);
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ }
return 0;
}
+// Test whether over-reading the source is safe.
+// TODO(fbarchard): Find a more efficient solution to safely handle odd sizes.
+// Macros to control read policy, from slowest to fastest:
+// READSAFE_NEVER - disables read ahead on systems with strict memory reads
+// READSAFE_ODDHEIGHT - last row of odd height done with C.
+// This policy assumes that the caller handles the last row of an odd height
+// image using C.
+// READSAFE_PAGE - enable read ahead within same page.
+// A page is 4096 bytes. When reading ahead, if the last pixel is near the
+//   end of the page and a read spans into the next page, a memory
+// exception can occur if that page has not been allocated, or is a guard
+// page. This setting ensures the overread is within the same page.
+// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
+// or where buffers are padded by 64 bytes.
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3) || \
+ defined(HAS_RGB24TOARGBROW_SSSE3) || \
+ defined(HAS_RAWTOARGBROW_SSSE3) || \
+ defined(HAS_RGB565TOARGBROW_SSE2) || \
+ defined(HAS_ARGB1555TOARGBROW_SSE2) || \
+ defined(HAS_ARGB4444TOARGBROW_SSE2)
+
+#define READSAFE_ODDHEIGHT
+
+static bool TestReadSafe(const uint8* src_yuy2, int src_stride_yuy2,
+ int width, int height, int bpp, int overread) {
+ if (width > kMaxStride) {
+ return false;
+ }
+#if defined(READSAFE_ALWAYS)
+ return true;
+#elif defined(READSAFE_NEVER)
+ return false;
+#elif defined(READSAFE_ODDHEIGHT)
+ if (!(width & 15) ||
+ (src_stride_yuy2 >= 0 && (height & 1) && width * bpp >= overread)) {
+ return true;
+ }
+ return false;
+#elif defined(READSAFE_PAGE)
+ if (src_stride_yuy2 >= 0) {
+ src_yuy2 += (height - 1) * src_stride_yuy2;
+ }
+ uintptr_t last_adr = (uintptr_t)(src_yuy2) + width * bpp - 1;
+ uintptr_t last_read_adr = last_adr + overread - 1;
+ if (((last_adr ^ last_read_adr) & ~4095) == 0) {
+ return true;
+ }
+ return false;
+#endif
+}
+#endif
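// Editor's sketch (not part of this patch): the page test that READSAFE_PAGE
// performs above, pulled out on its own. An overread is treated as safe only
// when the last byte actually needed and the last byte speculatively read fall
// within the same 4096-byte page.
static bool OverreadStaysInPage_Sketch(uintptr_t last_needed_addr,
                                       int overread) {
  uintptr_t last_read_addr = last_needed_addr + overread - 1;
  return ((last_needed_addr ^ last_read_addr) &
          ~static_cast<uintptr_t>(4095)) == 0;
}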
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_uv == NULL || dst_frame == NULL) {
- return -1;
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUVRow = YUY2ToUVRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
}
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
- // Bi-Planar: Y plane followed by an interlaced U and V plane
- const uint8* interlacedSrc = src_uv;
- uint16* out = (uint16*)(src_y) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame;
- int32 tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--) {
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++) {
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 565
-
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]]
- + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- interlacedSrc += 2;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- interlacedSrc += src_stride_uv - ((width + 1) >> 1);
- out -= 3 * dst_stride_frame + dst_stride_frame - width;
- out2 -= 3 * dst_stride_frame + dst_stride_frame - width;
+ for (int y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
}
return 0;
}
-// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_frame == NULL || dst_frame == NULL) {
- return -1;
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
}
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUVRow = UYVYToUVRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
- int i, j, offset;
- uint8* outFrame = dst_frame;
- const uint8* inFrame = src_frame;
+ for (int y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
- outFrame += dst_stride_frame * (height - 1) * 4;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- offset = j * 4;
- outFrame[0 + offset] = inFrame[0];
- outFrame[1 + offset] = inFrame[1];
- outFrame[2 + offset] = inFrame[2];
- outFrame[3 + offset] = 0xff;
- inFrame += 3;
+// Little endian: Visual C x86/x64/ARM, or GCC targets reporting little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define READWORD(p) (*reinterpret_cast<const uint32*>(p))
+#else
+static inline uint32 READWORD(const uint8* p) {
+ return static_cast<uint32>(p[0]) |
+ (static_cast<uint32>(p[1]) << 8) |
+ (static_cast<uint32>(p[2]) << 16) |
+ (static_cast<uint32>(p[3]) << 24);
+}
+#endif
+
+// Width must be a multiple of 6 pixels; over-converts to handle any remainder.
+// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
+static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
+ for (int x = 0; x < width; x += 6) {
+ uint32 w = READWORD(src_v210 + 0);
+ dst_uyvy[0] = (w >> 2) & 0xff;
+ dst_uyvy[1] = (w >> 12) & 0xff;
+ dst_uyvy[2] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 4);
+ dst_uyvy[3] = (w >> 2) & 0xff;
+ dst_uyvy[4] = (w >> 12) & 0xff;
+ dst_uyvy[5] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 8);
+ dst_uyvy[6] = (w >> 2) & 0xff;
+ dst_uyvy[7] = (w >> 12) & 0xff;
+ dst_uyvy[8] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 12);
+ dst_uyvy[9] = (w >> 2) & 0xff;
+ dst_uyvy[10] = (w >> 12) & 0xff;
+ dst_uyvy[11] = (w >> 22) & 0xff;
+
+ src_v210 += 16;
+ dst_uyvy += 12;
+ }
+}
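Each little-endian 32-bit word of V210 packs three 10-bit components in bits 0-9, 10-19 and 20-29, and the row function above keeps only the top 8 bits of each component via the >> 2, >> 12 and >> 22 shifts. A small self-contained check of that arithmetic, illustrative only and not library code:

// Illustrative check of the V210 unpacking shifts used by V210ToUYVYRow_C.
#include <cassert>
#include <cstdint>

int main() {
  // Pack three arbitrary 10-bit components the way a V210 word lays them out.
  const uint32_t c0 = 0x200, c1 = 0x3ff, c2 = 0x001;
  const uint32_t w = c0 | (c1 << 10) | (c2 << 20);
  // The row function keeps the top 8 of each component's 10 bits.
  assert(((w >> 2) & 0xff) == (c0 >> 2));
  assert(((w >> 12) & 0xff) == (c1 >> 2));
  assert(((w >> 22) & 0xff) == (c2 >> 2));
  return 0;
}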
+
+// Convert V210 to I420.
+// V210 is a 10-bit version of UYVY; 16 bytes store 6 pixels.
+// Width is a multiple of 48.
+LIBYUV_API
+int V210ToI420(const uint8* src_v210, int src_stride_v210,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 2 * 2 > kMaxStride) { // 2 rows of UYVY are required.
+ return -1;
+ } else if (!src_v210 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_v210 = src_v210 + (height - 1) * src_stride_v210;
+ src_stride_v210 = -src_stride_v210;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*V210ToUYVYRow)(const uint8* src_v210, uint8* dst_uyvy, int pix);
+ V210ToUYVYRow = V210ToUYVYRow_C;
+
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUVRow = UYVYToUVRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ V210ToUYVYRow(src_v210, row, width);
+ V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
+ UYVYToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ UYVYToYRow(row, dst_y, width);
+ UYVYToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_v210 += src_stride_v210 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ V210ToUYVYRow(src_v210, row, width);
+ UYVYToUVRow(row, 0, dst_u, dst_v, width);
+ UYVYToYRow(row, dst_y, width);
}
return 0;
}
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ARGBToYRow_C;
- }
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ARGBToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
}
return 0;
}
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_bgra ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix);
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width);
+
+ BGRAToYRow = BGRAToYRow_C;
+ BGRAToUVRow = BGRAToUVRow_C;
#if defined(HAS_BGRATOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = BGRAToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = BGRAToYRow_C;
- }
-#if defined(HAS_BGRATOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = BGRAToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = BGRAToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+ BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
}
return 0;
}
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_abgr ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix);
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width);
+
+ ABGRToYRow = ABGRToYRow_C;
+ ABGRToUVRow = ABGRToUVRow_C;
#if defined(HAS_ABGRTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ABGRToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ABGRToYRow_C;
- }
-#if defined(HAS_ABGRTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = ABGRToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ABGRToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+ ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
}
return 0;
}
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_rgba ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix);
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ RGBAToYRow = RGBAToYRow_C;
+ RGBAToUVRow = RGBAToUVRow_C;
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
+ RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_rgb24 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
}
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RGB24ToARGBRow = RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+#endif
+
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-#if defined(HAS_RGB24TOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = RGB24ToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = RGB24ToYRow_C;
- }
-#if defined(HAS_RGB24TOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = RGB24ToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = RGB24ToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_rgb24 += src_stride_rgb24 * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ RGB24ToARGBRow_C(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
}
return 0;
}
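RGB24ToI420 stages every pair of source rows through the on-stack ARGB buffer row[kMaxStride * 2], which is why frames wider than kMaxStride / 4 pixels are rejected up front. One possible workaround for over-wide frames, sketched here and not part of the library, is to convert in vertical bands at an even x offset while reusing the same strides:

// Illustrative only: convert an over-wide RGB24 frame in vertical bands.
// band_width must be even and small enough that band_width * 4 <= kMaxStride.
int RGB24ToI420Banded(const uint8* src_rgb24, int src_stride_rgb24,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height, int band_width) {
  for (int x = 0; x < width; x += band_width) {
    const int w = (width - x < band_width) ? (width - x) : band_width;
    const int r = RGB24ToI420(src_rgb24 + x * 3, src_stride_rgb24,  // 3 bytes/pixel.
                              dst_y + x, dst_stride_y,
                              dst_u + x / 2, dst_stride_u,  // Chroma is half width.
                              dst_v + x / 2, dst_stride_v,
                              w, height);
    if (r) {
      return r;
    }
  }
  return 0;
}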
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_raw ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
}
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RAWToARGBRow = RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+#endif
+
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-#if defined(HAS_RAWTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = RAWToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = RAWToYRow_C;
- }
-#if defined(HAS_RAWTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = RAWToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = RAWToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ RAWToARGBRow_C(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
}
return 0;
}
-} // namespace libyuv
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_rgb565 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RGB565ToARGBRow = RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGB565ToARGBRow_C(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_argb1555 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_argb1555, src_stride_argb1555, width, height, 2, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555,
+ row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGB1555ToARGBRow_C(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_argb4444 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_argb4444, src_stride_argb4444, width, height, 2, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444,
+ row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGB4444ToARGBRow_C(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
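MJPGToI420 below hands one of these helpers to MJpegDecoder::DecodeToCallback, which invokes the callback once per band of decoded rows with per-plane pointers and strides; the helper converts the band into the caller's planes and advances the pointers kept in I420Buffers. A sketch of a custom callback with the same shape (illustrative; only the callback signature and the DecodeToCallback call are taken from this file):

// Illustrative only: a callback that just tallies how many rows were decoded.
struct RowCounter {
  int rows_seen;
};

static void CountRows(void* opaque,
                      const uint8* const* /* data */,
                      const int* /* strides */,
                      int rows) {
  static_cast<RowCounter*>(opaque)->rows_seen += rows;
}

// Usage, mirroring MJPGToI420 below:
//   RowCounter counter = { 0 };
//   mjpeg_decoder.DecodeToCallback(&CountRows, &counter, dw, dh);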
+
+// MJPG (Motion JPEG) to I420.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port to C
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute the location of the planes and to indicate
+// inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+#ifdef HAVE_JPEG
+ size_t sample_size,
+#else
+ size_t /* sample_size */,
+#endif
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format) {
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // The temporary buffer is also used for in-place conversion, i.e. when the
+  // destination y plane is the same as the source sample.
+ bool need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int y_size = dst_width * abs_dst_height;
+ int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
+ buf = new uint8[y_size + uv_size * 2];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ y = buf;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = dst_width;
+ u_stride = v_stride = ((dst_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_V210:
+      // Stride is a multiple of 48 pixels (128 bytes).
+      // Pixels come in groups of 6 (16 bytes).
+ src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
+ crop_x / 6 * 16;
+ r = V210ToI420(src, (aligned_src_width + 47) / 48 * 128,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_Q420:
+ src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+ src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+ src_width + crop_x * 2;
+ r = Q420ToI420(src, src_width * 3,
+ src_uv, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+    delete [] buf;
+ }
+
+ return r;
+}
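A minimal usage sketch for ConvertToI420, assuming a tightly packed full-frame YUY2 sample, no cropping, and the kRotate0 enumerator from libyuv/rotate.h; the wrapper name and plane layout are illustrative, not part of the library:

// Illustrative caller: whole YUY2 camera frame to I420, no crop, no rotation.
int ConvertCameraFrame(const uint8* sample, size_t sample_size,
                       int width, int height,
                       uint8* y, uint8* u, uint8* v) {
  const int half_width = (width + 1) / 2;
  return ConvertToI420(sample, sample_size,
                       y, width,            // Y plane and stride.
                       u, half_width,       // U plane and stride.
                       v, half_width,       // V plane and stride.
                       0, 0,                // crop_x, crop_y.
                       width, height,       // src_width, src_height.
                       width, height,       // dst_width, dst_height.
                       kRotate0,
                       FOURCC_YUY2);
}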
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
new file mode 100644
index 00000000..1c5aa9d9
--- /dev/null
+++ b/files/source/convert_argb.cc
@@ -0,0 +1,1300 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include <string.h> // for memset()
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional vertical flipping.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width * 4, height);
+ return 0;
+}
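As elsewhere in these files, a negative height makes ARGBCopy walk the source bottom-up, so the copy comes out vertically flipped. Illustrative call, with placeholder buffer names and tightly packed rows assumed:

// Illustrative only: produce a vertically flipped copy by negating height.
int CopyArgbFlipped(const uint8* src, uint8* dst, int width, int height) {
  return ARGBCopy(src, width * 4,     // ARGB is 4 bytes per pixel.
                  dst, width * 4,
                  width, -height);    // Negative height flips the image.
}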
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I411ToARGBRow_C;
+#if defined(HAS_I411TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB (reference version).
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*YToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = YToARGBRow_C;
+#if defined(HAS_YTOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YToARGBRow = YToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ YToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+ I400ToARGBRow_C;
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_bgra || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) =
+ BGRAToARGBRow_C;
+#if defined(HAS_BGRATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ BGRAToARGBRow = BGRAToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ BGRAToARGBRow(src_bgra, dst_argb, width);
+ src_bgra += src_stride_bgra;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_abgr || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) =
+ ABGRToARGBRow_C;
+#if defined(HAS_ABGRTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ABGRToARGBRow = ABGRToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ABGRToARGBRow(src_abgr, dst_argb, width);
+ src_abgr += src_stride_abgr;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgba || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) =
+ RGBAToARGBRow_C;
+#if defined(HAS_RGBATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGBAToARGBRow = RGBAToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGBAToARGBRow(src_rgba, dst_argb, width);
+ src_rgba += src_stride_rgba;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb24 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb565 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb1555 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb4444 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
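+
+// Illustrative usage sketch (not part of the library): convert a
+// hypothetical 640x480 NV12 frame stored in one tightly packed buffer
+// (full-resolution Y plane followed by the interleaved half-resolution
+// UV plane). Buffer names and layout are assumptions, not library code.
+//
+//   const int w = 640, h = 480;
+//   std::vector<uint8> nv12(w * h * 3 / 2);  // Y plane then UV plane.
+//   std::vector<uint8> argb(w * h * 4);
+//   int ret = NV12ToARGB(nv12.data(), w,          // Y plane, stride w.
+//                        nv12.data() + w * h, w,  // UV plane, stride w.
+//                        argb.data(), w * 4,      // ARGB, 4 bytes/pixel.
+//                        w, h);                   // ret is 0 on success.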
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_m420 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ }
+ return 0;
+}
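+
+// Note on the M420 loop above: rows come in groups of two full-width Y rows
+// followed by one interleaved UV row, so both output rows of a pair share
+// the UV row at src_m420 + src_stride_m420 * 2, and the source advances by
+// 3 * src_stride_m420 per pair of output rows.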
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_yuy2 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix) = YUY2ToYRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 rowy[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowu[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowv[kMaxStride]);
+
+ for (int y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
+ YUY2ToYRow(src_yuy2, rowy, width);
+ I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
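+
+// Note on the YUY2 path above: each packed YUY2 row is first split into
+// temporary Y and U/V rows (YUY2ToYRow / YUY2ToUV422Row) and then fed to
+// I422ToARGBRow, so the conversion reuses the I422 row kernels at the cost
+// of three kMaxStride scratch rows per call.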
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_uyvy || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) = UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+#endif
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 rowy[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowu[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowv[kMaxStride]);
+
+ for (int y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, rowu, rowv, width);
+ UYVYToYRow(src_uyvy, rowy, width);
+ I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// Convert MJPG (Motion JPEG) to ARGB.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port to C
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for the source stride computation.
+// src_height is used to compute plane locations, and indicates inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format) {
+ if (dst_argb == NULL || sample == NULL ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // A temporary buffer is also used when the destination dst_argb is the
+  // same as the source sample (in-place conversion).
+ bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+ uint8* tmp_argb = dst_argb;
+ int tmp_argb_stride = argb_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int argb_size = dst_width * abs_dst_height * 4;
+ buf = new uint8[argb_size];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ dst_argb = buf;
+ argb_stride = dst_width;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+// case FOURCC_V210:
+ // stride is multiple of 48 pixels (128 bytes).
+ // pixels come in groups of 6 = 16 bytes
+// src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
+// crop_x / 6 * 16;
+// r = V210ToARGB(src, (aligned_src_width + 47) / 48 * 128,
+// dst_argb, argb_stride,
+// dst_width, inv_dst_height);
+// break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // NV21 is NV12 with U and V swapped; NV21ToARGB handles the VU order.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+// case FOURCC_Q420:
+// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+// src_width + crop_x * 2;
+// r = Q420ToARGB(src, src_width * 3,
+// src_uv, src_width * 3,
+// dst_argb, argb_stride,
+// dst_width, inv_dst_height);
+// break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ dst_argb, argb_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(dst_argb, argb_stride,
+ tmp_argb, tmp_argb_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+    delete [] buf;
+ }
+
+ return r;
+}
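+
+// Illustrative usage sketch (not part of the library): decode a hypothetical
+// 1280x720 YUY2 capture buffer to ARGB with no cropping and no rotation.
+// Buffer names and sizes are assumptions.
+//
+//   const int w = 1280, h = 720;
+//   std::vector<uint8> yuy2(w * h * 2);  // YUY2 is 2 bytes per pixel.
+//   std::vector<uint8> argb(w * h * 4);
+//   int ret = ConvertToARGB(yuy2.data(), yuy2.size(),
+//                           argb.data(), w * 4,
+//                           0, 0,        // crop_x, crop_y
+//                           w, h,        // src_width, src_height
+//                           w, h,        // dst_width, dst_height
+//                           kRotate0, FOURCC_YUY2);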
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
new file mode 100644
index 00000000..4ea974ac
--- /dev/null
+++ b/files/source/convert_from.cc
@@ -0,0 +1,1425 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#elif defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(halfwidth, 4)) {
+ CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+  // Upsample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_u, dst_u, halfwidth);
+ CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
+ src_u += src_stride_u;
+ dst_u += dst_stride_u * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_u, dst_u, halfwidth);
+ }
+
+  // Upsample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_v, dst_v, halfwidth);
+ CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
+ src_v += src_stride_v;
+ dst_v += dst_stride_v * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_v, dst_v, halfwidth);
+ }
+ return 0;
+}
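+
+// Note on the chroma upsample above: 4:2:0 chroma has half as many rows as
+// 4:2:2, so each source U/V row is copied to two destination rows (source
+// row 0 -> destination rows 0 and 1, source row 1 -> rows 2 and 3, ...);
+// an odd trailing destination row reuses the last source row.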
+
+// Use bilinear scaling to upsample the chroma planes.
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ // Upsample U plane.
+ ScalePlaneBilinear(halfwidth, halfheight,
+ width, height,
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Upsample V plane.
+ ScalePlaneBilinear(halfwidth, halfheight,
+ width, height,
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane.
+ ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
+ quarterwidth, height, // to 1/4 width, 1x height
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Resample V plane.
+ ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
+ quarterwidth, height, // to 1/4 width, 1x height
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
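+
+// Worked example for the 411 resample above: for a 64x48 I420 image the
+// source chroma planes are 32x24 (half width, half height) and the I411
+// chroma planes are 16x48 (quarter width, full height), so each chroma
+// plane is rescaled from 32x24 to 16x48 with the bilinear scaler.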
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// YUY2 - Macro-pixel = 2 image pixels.
+// Y0U0Y1V0....Y2U2Y3V2....Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels.
+// U0Y0V0Y1....U2Y2V2Y3....U4Y4V4Y5....
+
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_I42XTOYUY2ROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void I42xToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqa xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#define HAS_I42XTOUYVYROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void I42xToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqa xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqa [edi], xmm1
+ movdqa [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_I42XTOYUY2ROW_SSE2
+static void I42xToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%3) \n"
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_I42XTOUYVYROW_SSE2
+static void I42xToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%3) \n"
+ "movdqa %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+#endif
+
+static void I42xToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[0]; // duplicate last y
+ dst_frame[3] = src_v[0];
+ }
+}
+
+static void I42xToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[0]; // duplicate last y
+ }
+}
+
+// Little-endian targets: x86, x64, ARM, or __ORDER_LITTLE_ENDIAN__ compilers.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+#define EIGHTTOTEN(x) ((x) << 2 | (x) >> 6)
+static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
+ for (int x = 0; x < width; x += 6) {
+ WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) |
+ (EIGHTTOTEN(src_uyvy[1]) << 10) |
+ (EIGHTTOTEN(src_uyvy[2]) << 20));
+ WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) |
+ (EIGHTTOTEN(src_uyvy[4]) << 10) |
+ (EIGHTTOTEN(src_uyvy[5]) << 20));
+ WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) |
+ (EIGHTTOTEN(src_uyvy[7]) << 10) |
+ (EIGHTTOTEN(src_uyvy[8]) << 20));
+ WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) |
+ (EIGHTTOTEN(src_uyvy[10]) << 10) |
+ (EIGHTTOTEN(src_uyvy[11]) << 20));
+ src_uyvy += 12;
+ dst_v210 += 16;
+ }
+}
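+
+// Worked example for EIGHTTOTEN above: an 8-bit sample is widened to 10 bits
+// by replicating its two high bits into the low bits, e.g. 0x00 -> 0x000,
+// 0x80 -> 0x202 and 0xFF -> 0x3FF, preserving full range without a multiply.
+// Three 10-bit samples are then packed per 32-bit word, giving the V210
+// layout of 16 bytes per 6 pixels that I420ToV210 below checks against
+// kMaxStride.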
+
+// TODO(fbarchard): Deprecate, move or expand 422 support?
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToYUY2Row_C;
+#if defined(HAS_I42XTOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToYUY2Row = I42xToYUY2Row_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToYUY2Row_C;
+#if defined(HAS_I42XTOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToYUY2Row = I42xToYUY2Row_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ I42xToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_frame + dst_stride_frame, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ }
+ return 0;
+}
+
+// TODO(fbarchard): Deprecate, move or expand 422 support?
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ I42xToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_frame + dst_stride_frame, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToV210(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+  if (width * 16 / 6 > kMaxStride) {  // V210 row buffer would not fit.
+ return -1;
+ } else if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix);
+ UYVYToV210Row = UYVYToV210Row_C;
+
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToUYVYRow(src_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame, width);
+ I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame + dst_stride_frame, width);
+
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToUYVYRow(src_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame, width);
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
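+
+// Illustrative usage sketch (not part of the library): convert a
+// hypothetical tightly packed 320x240 I420 frame to ARGB. Buffer names and
+// the tight-stride plane layout are assumptions.
+//
+//   const int w = 320, h = 240;
+//   std::vector<uint8> i420(w * h * 3 / 2);
+//   std::vector<uint8> argb(w * h * 4);
+//   const uint8* y_plane = i420.data();
+//   const uint8* u_plane = y_plane + w * h;
+//   const uint8* v_plane = u_plane + (w / 2) * (h / 2);
+//   int ret = I420ToARGB(y_plane, w,
+//                        u_plane, w / 2,
+//                        v_plane, w / 2,
+//                        argb.data(), w * 4,
+//                        w, h);  // ret is 0 on success, -1 on bad arguments.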
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgb24 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB24Row_C;
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#elif defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+ dst_stride_rgb = -dst_stride_rgb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToRGB565Row(row, dst_rgb, width);
+ dst_rgb += dst_stride_rgb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
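+
+// Note on the RGB565 path above (and the ARGB1555/ARGB4444 paths below):
+// rather than using a dedicated I420-to-16-bit row kernel, each row is
+// converted to a temporary ARGB row in the kMaxStride scratch buffer and
+// then packed down to 16 bits per pixel by the ARGBTo*Row function.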
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToARGB1555Row(row, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToARGB4444Row(row, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format) {
+  if (!y || !u || !v || !dst_sample ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ int r = 0;
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_V210:
+ r = I420ToV210(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride :
+ (width + 47) / 48 * 128,
+ width, height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGGR:
+ r = I420ToBayerBGGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GBRG:
+ r = I420ToBayerGBRG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GRBG:
+ r = I420ToBayerGRBG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_RGGB:
+ r = I420ToBayerRGGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ // Triplanar formats
+ // TODO(fbarchard): halfstride instead of halfwidth
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * halfheight;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * halfheight;
+ }
+ r = I420Copy(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ int halfwidth = (width + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * height;
+ }
+ r = I420ToI422(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + width * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + width * height;
+ }
+ r = I420ToI444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, width,
+ dst_v, width,
+ width, height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (width + 3) / 4;
+ uint8* dst_u = dst_sample + width * height;
+ uint8* dst_v = dst_u + quarterwidth * height;
+ r = I420ToI411(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, quarterwidth,
+ dst_v, quarterwidth,
+ width, height);
+ break;
+ }
+
+    // Formats not supported: MJPG, biplanar, and some RGB formats.
+    default:
+      return -1;  // Unknown FourCC: return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
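A brief usage sketch for the dispatcher above (illustrative, not part of the patch); the frame dimensions and buffers are assumptions, and passing 0 for dst_sample_stride lets the switch above pick the per-format default (width * 4 for ARGB):

    #include "libyuv/convert_from.h"
    #include "libyuv/video_common.h"

    // Hypothetical caller: convert a 640x480 I420 frame to ARGB.
    int ExampleI420ToARGB(const uint8* y, const uint8* u, const uint8* v,
                          uint8* argb) {
      const int width = 640;
      const int height = 480;
      return libyuv::ConvertFromI420(y, width,
                                     u, (width + 1) / 2,
                                     v, (width + 1) / 2,
                                     argb, 0,  // 0 stride defaults to width * 4
                                     width, height,
                                     libyuv::FOURCC_ARGB);
    }
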
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index cc44e215..2e96d9b9 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -9,66 +9,206 @@
*/
#include "libyuv/cpu_id.h"
-#include "libyuv/basic_types.h" // for CPU_X86
#ifdef _MSC_VER
-#include <intrin.h>
+#include <intrin.h> // For __cpuid()
#endif
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#include <stdlib.h> // For getenv()
+
+// For ArmCpuCaps(), but unit tested on all platforms.
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h" // For CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- __asm__ volatile (
- "mov %%ebx, %%edi\n"
- "cpuid\n"
- "xchg %%edi, %%ebx\n"
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
+ : "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- __asm__ volatile (
- "cpuid\n"
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
+ : "a"(info_type));
}
#endif
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__))
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type) {
+ __cpuid(cpu_info, info_type);
+}
+#else
+LIBYUV_API
+void CpuId(int cpu_info[4], int) {
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ return static_cast<uint32>(_xgetbv(xcr));
+}
+#elif !defined(__CLR_VER) && defined(_M_IX86)
+#define HAS_XGETBV
+__declspec(naked) __declspec(align(16))
+static uint32 XGetBV(unsigned int xcr) {
+ __asm {
+ mov ecx, [esp + 4] // xcr
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005.
+ ret
+ }
+}
+#elif defined(__i386__) || defined(__x86_64__)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ uint32 xcr_feature_mask;
+ asm volatile ( // NOLINT
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(xcr_feature_mask)
+ : "c"(xcr)
+ : "memory", "cc", "edx"); // edx unused.
+ return xcr_feature_mask;
+}
+#endif
+#ifdef HAS_XGETBV
+static const int kXCR_XFEATURE_ENABLED_MASK = 0;
+#endif
+
+// Based on libvpx arm_cpudetect.c.
+// For ARM, but public to allow testing on any CPU.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name) {
+ int flags = 0;
+ FILE* fin = fopen(cpuinfo_name, "r");
+ if (fin) {
+ char buf[512];
+ while (fgets(buf, 511, fin)) {
+ if (memcmp(buf, "Features", 8) == 0) {
+ flags |= kCpuInitialized;
+ char* p = strstr(buf, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ flags |= kCpuHasNEON;
+ break;
+ }
+ }
+ }
+ fclose(fin);
+ }
+ return flags;
+}
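For context, ArmCpuCaps() keys off the "Features" line of /proc/cpuinfo and looks for a standalone " neon" token; on a NEON-capable SoC a representative line might look roughly like the following (illustrative, not taken from a specific device):

    Features  : swp half thumb fastmult vfp edsp thumbee neon vfpv3
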
// CPU detect function for SIMD instruction sets.
-static int cpu_info_ = 0;
+LIBYUV_API
+int cpu_info_ = 0;
-// TODO(fbarchard): (cpu_info[2] & 0x10000000 ? kCpuHasAVX : 0)
-static void InitCpuFlags() {
-#ifdef CPU_X86
+// Test environment variable for disabling CPU features. Any non-zero value
+// disables the feature. Zero is ignored so the variable can be toggled on/off.
+static bool TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return true;
+ }
+ }
+ return false;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+#if !defined(__CLR_VER) && defined(CPU_X86)
int cpu_info[4];
__cpuid(cpu_info, 1);
- cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
- (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
- kCpuInitialized;
+ cpu_info_ = ((cpu_info[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ (((cpu_info[2] & 0x18000000) == 0x18000000) ? kCpuHasAVX : 0) |
+ kCpuInitialized | kCpuHasX86;
+#ifdef HAS_XGETBV
+ if (cpu_info_ & kCpuHasAVX) {
+ __cpuid(cpu_info, 7);
+ if ((cpu_info[1] & 0x00000020) &&
+ ((XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06)) {
+ cpu_info_ |= kCpuHasAVX2;
+ }
+ }
+#endif
+ // environment variable overrides for testing.
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info_ &= ~kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info_ &= ~kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info_ &= ~kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info_ &= ~kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info_ &= ~kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info_ &= ~kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info_ &= ~kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = kCpuInitialized;
+ }
+#elif defined(__arm__)
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+  // Linux ARM: parse the /proc/cpuinfo text file for NEON detection.
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
- cpu_info_ = kCpuHasNEON | kCpuInitialized;
-#else
- cpu_info_ = kCpuInitialized;
+ cpu_info_ = kCpuHasNEON;
#endif
+ cpu_info_ |= kCpuInitialized | kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info_ &= ~kCpuHasNEON;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = kCpuInitialized;
+ }
+#endif // __arm__
+ return cpu_info_;
}
+LIBYUV_API
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
- cpu_info_ &= enable_flags;
-}
-
-bool TestCpuFlag(int flag) {
- if (0 == cpu_info_) {
- InitCpuFlags();
- }
- return cpu_info_ & flag ? true : false;
+ cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
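A short usage sketch for the detection path above (illustrative, not part of the patch). TestCpuFlag() initializes the flags lazily, and MaskCpuFlags(0) leaves only kCpuInitialized set, which matches the effect of exporting LIBYUV_DISABLE_ASM=1 before startup:

    #include "libyuv/cpu_id.h"

    // Hypothetical helpers around the public cpu_id API shown above.
    bool HasSSSE3() {
      return libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
    }

    void ForcePortableCode() {
      libyuv::MaskCpuFlags(0);  // keep only kCpuInitialized; C paths everywhere
    }
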
diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc
index 958f44c4..ed12de88 100644
--- a/files/source/format_conversion.cc
+++ b/files/source/format_conversion.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,66 +8,73 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
+#include "libyuv/format_conversion.h"
+#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "video_common.h"
-#include "row.h"
-
-#define kMaxStride (2048 * 4)
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
-// and vst would select which 2 components to write. The low level would need
+// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_bayer
- movd xmm7, [esp + 12] // selector
+ movd xmm5, [esp + 12] // selector
mov ecx, [esp + 16] // pix
- pshufd xmm7, xmm7, 0
+ pshufd xmm5, xmm5, 0
+ align 16
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
- pshufb xmm0, xmm7
+ pshufb xmm0, xmm5
+ sub ecx, 4
movd [edx], xmm0
lea edx, [edx + 4]
- sub ecx, 4
- ja wloop
+ jg wloop
ret
}
}
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
- asm volatile(
- "movd %3,%%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "movd %3,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
- : "r"(selector) // %3
- : "memory"
+ : "g"(selector) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+
);
}
#endif
@@ -77,7 +84,7 @@ static void ARGBToBayerRow_C(const uint8* src_argb,
int index0 = selector & 0xff;
int index1 = (selector >> 8) & 0xff;
// Copy a row of Bayer.
- for (int x = 0; x < (pix - 1); x += 2) {
+ for (int x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[index0];
dst_bayer[1] = src_argb[index1];
src_argb += 8;
@@ -96,243 +103,258 @@ static uint32 GenerateSelector(int select0, int select1) {
static_cast<uint32>((select1 + 12) << 24);
}
-// Converts 32 bit ARGB to any Bayer RGB format.
-int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
- uint8* dst_bayer, int dst_stride_bayer,
- uint32 dst_fourcc_bayer,
- int width, int height) {
- if (height < 0) {
- height = -height;
- src_rgb = src_rgb + (height - 1) * src_stride_rgb;
- src_stride_rgb = -src_stride_rgb;
- }
- void (*ARGBToBayerRow)(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix);
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) &&
- IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- } else
-#endif
- {
- ARGBToBayerRow = ARGBToBayerRow_C;
- }
-
- int blue_index = 0;
- int green_index = 1;
- int red_index = 2;
-
+static int MakeSelectors(const int blue_index,
+ const int green_index,
+ const int red_index,
+ uint32 dst_fourcc_bayer,
+ uint32 *index_map) {
// Now build a lookup table containing the indices for the four pixels in each
// 2x2 Bayer grid.
- uint32 index_map[2];
switch (dst_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
case FOURCC_BGGR:
index_map[0] = GenerateSelector(blue_index, green_index);
index_map[1] = GenerateSelector(green_index, red_index);
break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
case FOURCC_GBRG:
index_map[0] = GenerateSelector(green_index, blue_index);
index_map[1] = GenerateSelector(red_index, green_index);
break;
+ case FOURCC_RGGB:
+ index_map[0] = GenerateSelector(red_index, green_index);
+ index_map[1] = GenerateSelector(green_index, blue_index);
+ break;
+ case FOURCC_GRBG:
+ index_map[0] = GenerateSelector(green_index, red_index);
+ index_map[1] = GenerateSelector(blue_index, green_index);
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+ return 0;
+}
+
+// Converts 32 bit ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+#endif
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
}
- // Now convert.
for (int y = 0; y < height; ++y) {
- ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width);
- src_rgb += src_stride_rgb;
+ ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
+ src_argb += src_stride_argb;
dst_bayer += dst_stride_bayer;
}
return 0;
}
-#define AVG(a,b) (((a) + (b)) >> 1)
+#define AVG(a, b) (((a) + (b)) >> 1)
static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = src_bayer0[0];
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = AVG(r, src_bayer1[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer1[1];
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
g = src_bayer0[1];
r = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = src_bayer0[0];
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = AVG(r, src_bayer1[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[0];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer1[1];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[0];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = AVG(b, src_bayer1[1]);
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = src_bayer0[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[1];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[7] = 255U;
g = src_bayer0[1];
b = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = AVG(b, src_bayer1[1]);
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = src_bayer0[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[1];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer0[0];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer0[0];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = AVG(b, src_bayer0[1]);
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = src_bayer1[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[1];
- dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[7] = 255U;
b = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = AVG(b, src_bayer0[1]);
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = src_bayer1[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[1];
- dst_rgb[5] = src_bayer0[0];
- dst_rgb[6] = src_bayer1[0];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer1[0];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = src_bayer1[0];
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = AVG(r, src_bayer0[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[6] = src_bayer0[1];
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
r = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = src_bayer1[0];
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = AVG(r, src_bayer0[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[0];
- dst_rgb[5] = src_bayer0[0];
- dst_rgb[6] = src_bayer0[1];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[0];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ }
}
// Converts any Bayer RGB format to ARGB.
-int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height) {
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
if (height < 0) {
height = -height;
- dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
- dst_stride_rgb = -dst_stride_rgb;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
+ uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
-
+ uint8* dst_argb, int pix);
switch (src_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
break;
+ default:
+ return -1; // Bad FourCC
}
- for (int y = 0; y < (height - 1); y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
+ for (int y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_rgb + dst_stride_rgb, width);
+ dst_argb + dst_stride_argb, width);
src_bayer += src_stride_bayer * 2;
- dst_rgb += dst_stride_rgb * 2;
+ dst_argb += dst_stride_argb * 2;
}
if (height & 1) {
- BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width);
+ BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width);
}
return 0;
}
// Converts any Bayer RGB format to ARGB.
-int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
if (width * 4 > kMaxStride) {
- return -1;
+ return -1; // Size too large for row buffer
}
// Negative height means invert the image.
if (height < 0) {
@@ -346,60 +368,50 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
dst_stride_v = -dst_stride_v;
}
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
+ uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ uint8* dst_argb, int pix);
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ARGBToYRow_C;
}
+#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ARGBToUVRow_C;
}
+#endif
switch (src_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
break;
+ default:
+ return -1; // Bad FourCC
}
- for (int y = 0; y < (height - 1); y += 2) {
+ for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width);
@@ -411,7 +423,6 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
- // TODO(fbarchard): Make sure this filters properly
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
@@ -420,4 +431,124 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
return 0;
}
+// Convert I420 to Bayer.
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+#endif
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+ dst_bayer += dst_stride_bayer;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_y, int dst_stride_y, \
+ uint8* dst_u, int dst_stride_u, \
+ uint8* dst_v, int dst_stride_v, \
+ int width, int height) { \
+ return BayerToI420(src_bayer, src_stride_bayer, \
+ dst_y, dst_stride_y, \
+ dst_u, dst_stride_u, \
+ dst_v, dst_stride_v, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+ const uint8* src_u, int src_stride_u, \
+ const uint8* src_v, int src_stride_v, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return I420ToBayer(src_y, src_stride_y, \
+ src_u, src_stride_u, \
+ src_v, src_stride_v, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return ARGBToBayer(src_argb, src_stride_argb, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_argb, int dst_stride_argb, \
+ int width, int height) { \
+ return BayerToARGB(src_bayer, src_stride_bayer, \
+ dst_argb, dst_stride_argb, \
+ width, height, \
+ FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
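The MAKEBAYERFOURCC expansion above generates the per-pattern wrappers (Bayer*ToI420, I420ToBayer*, ARGBToBayer*, Bayer*ToARGB). A minimal caller sketch for one of them; the stride choices are illustrative assumptions (Bayer data is 1 byte per pixel, ARGB is 4):

    #include "libyuv/format_conversion.h"

    // Hypothetical demosaic call using the generated BGGR wrapper.
    int DemosaicBGGR(const uint8* bayer, uint8* argb, int width, int height) {
      return libyuv::BayerBGGRToARGB(bayer, width,
                                     argb, width * 4,
                                     width, height);
    }
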
diff --git a/files/source/general.cc b/files/source/general.cc
deleted file mode 100644
index 9d39f9bf..00000000
--- a/files/source/general.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/general.h"
-
-#include <string.h> // memcpy(), memset()
-
-#include "libyuv/planar_functions.h"
-
-namespace libyuv {
-
-int
-I420Mirror(const uint8* src_yplane, int src_ystride,
- const uint8* src_uplane, int src_ustride,
- const uint8* src_vplane, int src_vstride,
- uint8* dst_yplane, int dst_ystride,
- uint8* dst_uplane, int dst_ustride,
- uint8* dst_vplane, int dst_vstride,
- int width, int height) {
- if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
- dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) {
- return -1;
- }
-
- int indO = 0;
- int indS = 0;
- int wind, hind;
- uint8 tmpVal, tmpValU, tmpValV;
- // Will swap two values per iteration
- const int halfWidth = (width + 1) >> 1;
-
- // Y
- for (wind = 0; wind < halfWidth; wind++) {
- for (hind = 0; hind < height; hind++) {
- indO = hind * src_ystride + wind;
- indS = hind * dst_ystride + (width - wind - 1);
- tmpVal = src_yplane[indO];
- dst_yplane[indO] = src_yplane[indS];
- dst_yplane[indS] = tmpVal;
- }
- }
-
- const int halfHeight = (height + 1) >> 1;
- const int halfSrcuvStride = (height + 1) >> 1;
- const int halfuvWidth = (width + 1) >> 2;
-
- for (wind = 0; wind < halfuvWidth; wind++) {
- for (hind = 0; hind < halfHeight; hind++) {
- indO = hind * halfSrcuvStride + wind;
- indS = hind * halfSrcuvStride + (halfuvWidth - wind - 1);
- // U
- tmpValU = src_uplane[indO];
- dst_uplane[indO] = src_uplane[indS];
- dst_uplane[indS] = tmpValU;
- // V
- tmpValV = src_vplane[indO];
- dst_vplane[indO] = src_vplane[indS];
- dst_vplane[indS] = tmpValV;
- }
- }
- return 0;
-}
-
-// Make a center cut
-int
-I420Crop(uint8* frame,
- int src_width, int src_height,
- int dst_width, int dst_height)
-{
- if (frame == NULL)
- return -1;
-
- if (src_width == dst_width && src_height == dst_height) {
- // Nothing to do
- return 3 * dst_height * dst_width / 2;
- }
- if (dst_width > src_width || dst_height > src_height) {
- // error
- return -1;
- }
- int i = 0;
- int m = 0;
- int loop = 0;
- int half_dst_width = dst_width / 2;
- int halfdst_height = dst_height / 2;
- int halfsrc_width = src_width / 2;
- int half_dst_height= src_height / 2;
- int crop_height = ( src_height - dst_height ) / 2;
- int crop_width = ( src_width - dst_width ) / 2;
-
- for (i = src_width * crop_height + crop_width; loop < dst_height ;
- loop++, i += src_width) {
- memcpy(&frame[m],&frame[i],dst_width);
- m += dst_width;
- }
- i = src_width * src_height; // ilum
- loop = 0;
- for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
- loop < halfdst_height; loop++,i += halfsrc_width) {
- memcpy(&frame[m],&frame[i],half_dst_width);
- m += half_dst_width;
- }
- loop = 0;
- i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr
- for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
- loop < halfdst_height; loop++, i += halfsrc_width) {
- memcpy(&frame[m],&frame[i],half_dst_width);
- m += half_dst_width;
- }
- return 0;
-}
-
-
-int
-I420CropPad(const uint8* src_frame, int src_width,
- int src_height, uint8* dst_frame,
- int dst_width, int dst_height)
-{
- if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) {
- return -1;
- }
- if (src_width == dst_width && src_height == dst_height) {
- memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1));
- } else {
- if (src_height < dst_height) {
- // pad height
- int pad_height = dst_height - src_height;
- int i = 0;
- int pad_width = 0;
- int crop_width = 0;
- int width = src_width;
- if (src_width < dst_width) {
- // pad width
- pad_width = dst_width - src_width;
- } else {
- // cut width
- crop_width = src_width - dst_width;
- width = dst_width;
- }
- if (pad_height) {
- memset(dst_frame, 0, dst_width * (pad_height >> 1));
- dst_frame += dst_width * (pad_height >> 1);
- }
- for (i = 0; i < src_height;i++) {
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- src_frame += crop_width >> 1; // in case we have a cut
- memcpy(dst_frame,src_frame ,width);
- src_frame += crop_width >> 1;
- dst_frame += width;
- src_frame += width;
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 0, dst_width * (pad_height >> 1));
- dst_frame += dst_width * (pad_height >> 1);
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
- dst_frame += (dst_width >> 2) * (pad_height >> 1);
- }
- for (i = 0; i < (src_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1));
- dst_frame += (dst_width >> 1) * (pad_height >> 1);
- }
- for (i = 0; i < (src_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
- dst_frame += (dst_width >> 2) * (pad_height >> 1);
- }
- } else {
- // cut height
- int i = 0;
- int pad_width = 0;
- int crop_width = 0;
- int width = src_width;
-
- if (src_width < dst_width) {
- // pad width
- pad_width = dst_width - src_width;
- } else {
- // cut width
- crop_width = src_width - dst_width;
- width = dst_width;
- }
- int diff_height = src_height - dst_height;
- src_frame += src_width * (diff_height >> 1); // skip top I
-
- for (i = 0; i < dst_height; i++) {
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- src_frame += crop_width >> 1; // in case we have a cut
- memcpy(dst_frame,src_frame ,width);
- src_frame += crop_width >> 1;
- dst_frame += width;
- src_frame += width;
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- }
- src_frame += src_width * (diff_height >> 1); // skip end I
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr
- for (i = 0; i < (dst_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb
- for (i = 0; i < (dst_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame, width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- }
- }
- return 0;
-}
-
-} // namespace libyuv
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
new file mode 100644
index 00000000..aa603947
--- /dev/null
+++ b/files/source/mjpeg_decoder.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+// Must be included before jpeglib
+#include <assert.h>
+#ifndef __CLR_VER
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+extern "C" {
+#include <jpeglib.h>
+}
+
+#include <climits>
+#include <cstring>
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(false),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+// Helper function to validate that the jpeg looks ok.
+// TODO(fbarchard): Improve performance. Scan backward for EOI?
+bool ValidateJpeg(const uint8* sample, size_t sample_size) {
+ if (sample_size < 64) {
+ // ERROR: Invalid jpeg size: sample_size
+ return false;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) {
+ // ERROR: Invalid jpeg initial start code
+ return false;
+ }
+ bool soi = true;
+ int total_eoi = 0;
+ for (int i = 2; i < static_cast<int>(sample_size) - 1; ++i) {
+ if (sample[i] == 0xff) {
+ if (sample[i + 1] == 0xd8) { // Start Of Image
+ soi = true;
+ } else if (sample[i + 1] == 0xd9) { // End Of Image
+ if (soi) {
+ ++total_eoi;
+ }
+ soi = false;
+ }
+ }
+ }
+ if (!total_eoi) {
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return false;
+ }
+ return true;
+}
+
+bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return false;
+ }
+
+ buf_.data = src;
+ buf_.len = static_cast<int>(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return false;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+ delete scanlines_[i];
+ }
+ scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCU blocks. For
+ // the preceding scanlines, the padding is not needed/wanted because the
+ // following addresses will already be valid (they are the initial bytes of
+ // the next scanline) and will be overwritten when jpeglib writes out that
+ // next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+ delete databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = true;
+ }
+ }
+ return true;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor /
+ GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor /
+ GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+bool MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+static void CopyRows(uint8* source, int source_stride,
+ uint8* dest, int pixels, int numrows) {
+ for (int i = 0; i < numrows; ++i) {
+ memcpy(dest, source, pixels);
+ dest += pixels;
+ source += source_stride;
+ }
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+bool MJpegDecoder::DecodeToBuffers(
+ uint8** planes, int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // Compute the number of lines to skip to implement vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of the maximum component
+  // subsample, i.e. 2.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip =
+ DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+ rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyRows(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyRows(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyRows(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // TODO(fbarchard): Compute the number of lines to skip for vertical crop.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
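For orientation, a hedged sketch of how the decoder is typically driven (buffer ownership, plane count, and sizes are assumptions; real callers should check GetNumComponents() and the sub-sample factors before sizing the planes):

    #include "libyuv/mjpeg_decoder.h"

    // Hypothetical wrapper: decode one MJPEG frame into caller-owned planes.
    bool DecodeFrame(const uint8* sample, size_t sample_size,
                     uint8* y, uint8* u, uint8* v) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(sample, sample_size)) {
        return false;  // Invalid JPEG or bad header.
      }
      uint8* planes[3] = { y, u, v };
      return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                     decoder.GetHeight());
    }
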
+void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+ // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error. Unfortunately
+  // jpeglib's error handling model is not very flexible, because it expects the
+  // error handler to not return--i.e., it wants the program to terminate. To
+  // recover from errors we use setjmp() as shown in their example: setjmp()
+  // records a point on the call stack that longjmp() can later jump back to,
+  // giving a limited, non-local form of error return instead of termination.
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+ // ERROR: Error in jpeglib: buf
+
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
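The comment above only briefly describes jpeglib's setjmp()-based error recovery. The following standalone sketch shows that pattern, modeled on libjpeg's own example code rather than on MJpegDecoder's internals; ExampleErrorMgr, ExampleErrorExit and DecodeHeaderOrFail are illustrative names, and jpeg_mem_src() assumes libjpeg 8 or libjpeg-turbo.

#include <csetjmp>
#include <cstddef>
#include <cstdint>
#include <jpeglib.h>

// Error manager that extends jpeg_error_mgr with a jump buffer (illustrative).
struct ExampleErrorMgr {
  jpeg_error_mgr pub;      // "public" fields libjpeg expects at offset 0
  jmp_buf setjmp_buffer;   // where longjmp() will return to
};

// Installed as error_exit so libjpeg does not terminate the process.
static void ExampleErrorExit(j_common_ptr cinfo) {
  ExampleErrorMgr* mgr = reinterpret_cast<ExampleErrorMgr*>(cinfo->err);
  longjmp(mgr->setjmp_buffer, 1);  // rewind the stack to the setjmp() below
}

// Returns false instead of aborting when the JPEG header is malformed.
static bool DecodeHeaderOrFail(const uint8_t* buf, size_t len) {
  jpeg_decompress_struct cinfo;
  ExampleErrorMgr jerr;
  cinfo.err = jpeg_std_error(&jerr.pub);
  jerr.pub.error_exit = ExampleErrorExit;
  if (setjmp(jerr.setjmp_buffer)) {
    // A libjpeg call below failed; longjmp() brought control back here.
    jpeg_destroy_decompress(&cinfo);
    return false;
  }
  jpeg_create_decompress(&cinfo);
  jpeg_mem_src(&cinfo, const_cast<uint8_t*>(buf),
               static_cast<unsigned long>(len));  // libjpeg 8 / libjpeg-turbo
  jpeg_read_header(&cinfo, TRUE);
  jpeg_destroy_decompress(&cinfo);
  return true;
}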
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8** [num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8* [num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete [] scanlines_[i];
+ delete [] databuf_[i];
+ }
+ delete [] scanlines_;
+ delete [] databuf_;
+ delete [] scanlines_sizes_;
+ delete [] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling do_block_smoothing improve performance substantially.
+bool MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ decompress_struct_->do_fancy_upsampling = false; // Not applicable to 'raw'
+ decompress_struct_->enable_2pass_quant = false; // Only for buffered mode
+ decompress_struct_->do_block_smoothing = false; // blocky but fast
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+ return false;
+ }
+ return true;
+}
+
+bool MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline bool MJpegDecoder::DecodeImcuRow() {
+ return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_,
+ scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// Helper that maps per-component sub-sampling factors to a JpegSubsamplingType.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 2 &&
+ subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
+
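As a usage illustration for DecodeToCallback() above: the decoder calls the supplied function once per group of decoded rows with per-component pointers, per-component strides, and a scanline count. The sketch below assumes the CallbackFunction typedef and the LoadFrame()/GetWidth()/GetHeight() accessors declared in mjpeg_decoder.h (not shown in this hunk); YPlaneSink and GatherY are illustrative names, and for brevity only the Y plane is gathered. Subsampled chroma planes would additionally need each component's vertical subsample factor to convert the luma scanline count into rows of that component.

#include <string.h>  // memcpy

// uint8 here is libyuv's typedef for unsigned char (libyuv/basic_types.h).
// Caller-owned destination for the luma plane, packed with stride == width.
struct YPlaneSink {
  uint8* dst;
  int width;  // bytes per row
};

// Matches the assumed CallbackFunction signature from mjpeg_decoder.h.
static void GatherY(void* opaque, const uint8* const* data,
                    const int* strides, int rows) {
  YPlaneSink* sink = static_cast<YPlaneSink*>(opaque);
  const uint8* src = data[0];  // component 0 is Y
  for (int r = 0; r < rows; ++r) {
    memcpy(sink->dst, src, sink->width);
    src += strides[0];
    sink->dst += sink->width;
  }
}

// Sketch of driving the decoder (LoadFrame() assumed from the header):
//   libyuv::MJpegDecoder decoder;
//   if (decoder.LoadFrame(jpeg_bytes, jpeg_len)) {
//     YPlaneSink sink = { y_plane, decoder.GetWidth() };
//     decoder.DecodeToCallback(&GatherY, &sink,
//                              decoder.GetWidth(), decoder.GetHeight());
//   }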
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index a7e3e38a..a7f5086a 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,321 +10,104 @@
#include "libyuv/planar_functions.h"
-#include <string.h>
+#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
-#include "row.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
-
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITUV_NEON
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-static void SplitUV_NEON(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm__ volatile
- (
- "1:\n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
- "vst1.u8 {q0}, [%1]! \n" // store U
- "vst1.u8 {q1}, [%2]! \n" // Store V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_uv),
- "+r"(dst_u),
- "+r"(dst_v),
- "+r"(pix) // Output registers
- : // Input registers
- : "q0", "q1" // Clobber List
- );
-}
-
-#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+extern "C" {
#endif
-// Shuffle table for converting ABGR to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting BGRA to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITUV_SSE2
-__declspec(naked)
-static void SplitUV_SSE2(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm7 // even bytes
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqa [edi], xmm2
- lea edi, [edi + 16]
- sub ecx, 16
- ja wloop
- pop edi
- ret
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_SPLITUV_SSE2
-static void SplitUV_SSE2(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "psrlw $0x8,%%xmm2\n"
- "psrlw $0x8,%%xmm3\n"
- "packuswb %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,(%2)\n"
- "lea 0x10(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory"
-);
-}
#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
#endif
-
-static void SplitUV_C(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of UV.
- for (int x = 0; x < pix; ++x) {
- dst_u[0] = src_uv[0];
- dst_v[0] = src_uv[1];
- src_uv += 2;
- dst_u += 1;
- dst_v += 1;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
}
-}
+#endif
-static void I420CopyPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
// Copy plane
for (int y = 0; y < height; ++y) {
- memcpy(dst_y, src_y, width);
+ CopyRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
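Since CopyPlane() takes independent source and destination strides, it can also lift a window out of a larger buffer; a minimal sketch follows (CopyYWindow and the buffer names are illustrative, not libyuv API).

// Copy a width x height window starting at pixel (x, y) of a larger Y plane
// into a tightly packed destination (dst stride == width).
void CopyYWindow(const uint8* src_full, int full_stride,
                 int x, int y, int width, int height,
                 uint8* dst_packed) {
  const uint8* src_origin = src_full + y * full_stride + x;
  CopyPlane(src_origin, full_stride,  // source rows use the full image pitch
            dst_packed, width,        // destination rows are packed
            width, height);
}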
-// Copy I420 with optional flipping
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ uint8*, int, // src_u
+ uint8*, int, // src_v
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
-
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
}
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
return 0;
}
-// SetRows32 writes 'count' bytes using a 32 bit value repeated
-
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
-#define HAS_SETROW_NEON
-static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
- __asm__ volatile
- (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1:\n"
- "vst1.u32 {q0}, [%0]! \n" // store
- "subs %1, %1, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "q0", "memory"
- );
-}
-
-#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SETROW_SSE2
-__declspec(naked)
-static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
- __asm {
- mov eax, [esp + 4] // dst
- movd xmm7, [esp + 8] // v32
- mov ecx, [esp + 12] // count
- pshufd xmm7, xmm7, 0
-
- wloop:
- movdqa [eax], xmm7
- lea eax, [eax + 16]
- sub ecx, 16
- ja wloop
- ret
+// Mirror a plane of data
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-#define HAS_SETROW_SSE2
-static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
- asm volatile(
- "movd %2, %%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
-"1:"
- "movdqa %%xmm7,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%1\n"
- "ja 1b\n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "memory"
-);
-}
#endif
-
-static void SetRow8_C(uint8* dst, uint32 v8, int count) {
- memset(dst, v8, count);
-}
-
-static void I420SetPlane(uint8* dst_y, int dst_stride_y,
- int width, int height,
- int value) {
- void (*SetRow)(uint8* dst, uint32 value, int pix);
-#if defined(HAS_SETROW_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- SetRow = SetRow32_NEON;
- } else
-#elif defined(HAS_SETROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- SetRow = SetRow32_SSE2;
- } else
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
#endif
- {
- SetRow = SetRow8_C;
}
+#endif
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
- // Set plane
+ // Mirror plane
for (int y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
-// Draw a rectangle into I420
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v) {
- if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0 ||
- x < 0 || y < 0 ||
- value_y < 0 || value_y > 255 ||
- value_u < 0 || value_u > 255 ||
- value_v < 0 || value_v > 255) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- uint8* start_y = dst_y + y * dst_stride_y + x;
- uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
- uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-
- I420SetPlane(start_y, dst_stride_y, width, height, value_y);
- I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
- I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
- return 0;
-}
-
-// Helper function to copy yuv data without scaling. Used
-// by our jpeg conversion callbacks to incrementally fill a yuv image.
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
@@ -332,1244 +115,1314 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
}
-
- // Copy Y plane
- I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-
- // SubSample UV planes.
- int x, y;
- int halfwidth = (width + 1) >> 1;
- for (y = 0; y < height; y += 2) {
- const uint8* u0 = src_u;
- const uint8* u1 = src_u + src_stride_u;
- if ((y + 1) >= height) {
- u1 = u0;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUV422Row = YUY2ToUV422Row_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
- for (x = 0; x < halfwidth; ++x) {
- dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
}
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
}
- for (y = 0; y < height; y += 2) {
- const uint8* v0 = src_v;
- const uint8* v1 = src_v + src_stride_v;
- if ((y + 1) >= height) {
- v1 = v0;
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
}
- for (x = 0; x < halfwidth; ++x) {
- dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
- uint8* dst, int dst_stride_frame,
- int width, int height) {
- // Copy plane
- for (int y = 0; y < height; y += 2) {
- memcpy(dst, src, width);
- src += src_stride_0;
- dst += dst_stride_frame;
- memcpy(dst, src, width);
- src += src_stride_1;
- dst += dst_stride_frame;
}
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
- int src_stride_y0, int src_stride_y1,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- int halfwidth = (width + 1) >> 1;
- void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-#if defined(HAS_SPLITUV_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (halfwidth % 16 == 0) &&
- IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
- IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
- IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
- SplitUV = SplitUV_NEON;
- } else
-#elif defined(HAS_SPLITUV_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (halfwidth % 16 == 0) &&
- IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
- IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
- IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
- SplitUV = SplitUV_SSE2;
- } else
#endif
- {
- SplitUV = SplitUV_C;
- }
-
- I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- int halfheight = (height + 1) >> 1;
- for (int y = 0; y < halfheight; ++y) {
- // Copy a row of UV.
- SplitUV(src_uv, dst_u, dst_v, halfwidth);
+ for (int y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
- src_uv += src_stride_uv;
}
return 0;
}
-// Convert M420 to I420.
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert NV12 to I420.
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert NV12 to I420. Deprecated.
-int NV12ToI420(const uint8* src_y,
- const uint8* src_uv,
- int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_frame, src_stride_frame,
- src_uv, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITYUY2_SSE2
-__declspec(naked)
-static void SplitYUY2_SSE2(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov edx, [esp + 8 + 8] // dst_y
- mov esi, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm2, xmm7 // even bytes are Y
- pand xmm3, xmm7
- packuswb xmm2, xmm3
- movdqa [edx], xmm2
- lea edx, [edx + 16]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [esi], xmm0
- lea esi, [esi + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUV422Row = UYVYToUV422Row_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_SPLITYUY2_SSE2
-static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "packuswb %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,(%1)\n"
- "lea 0x10(%1),%1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%2)\n"
- "lea 0x8(%2),%2\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%3)\n"
- "lea 0x8(%3),%3\n"
- "sub $0x10,%4\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "memory"
-);
-}
#endif
-static void SplitYUY2_C(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of YUY2.
- for (int x = 0; x < pix; x += 2) {
- dst_y[0] = src_yuy2[0];
- dst_y[1] = src_yuy2[2];
- dst_u[0] = src_yuy2[1];
- dst_v[0] = src_yuy2[3];
- src_yuy2 += 4;
- dst_y += 2;
- dst_u += 1;
- dst_v += 1;
+ for (int y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
}
+ return 0;
}
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- void (*SplitYUY2)(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
-#if defined(HAS_SPLITYUY2_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- SplitYUY2 = SplitYUY2_SSE2;
- } else
-#endif
- {
- SplitYUY2 = SplitYUY2_C;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
}
- for (int y = 0; y < height; y += 2) {
- memcpy(dst_y, src_y, width);
- dst_y += dst_stride_y;
- src_y += src_stride_y;
- // Copy a row of YUY2.
- SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- src_yuy2 += src_stride_yuy2;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_YUY2TOI420ROW_SSE2
-__declspec(naked)
-void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm7 // even bytes are Y
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
- ret
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-__declspec(naked)
-void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_y, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
-}
-#define HAS_UYVYTOI420ROW_SSE2
-__declspec(naked)
-void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
- ret
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
-}
+#endif
-__declspec(naked)
-void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_y, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm7 // UYVY -> UVUV
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Mirror plane
+ for (int y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
}
+ return 0;
}
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-#define HAS_YUY2TOI420ROW_SSE2
-static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
-}
-
-static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_y), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
- : "memory"
-);
-}
-#define HAS_UYVYTOI420ROW_SSE2
-static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- asm volatile(
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
-}
-
-static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_y), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
- : "memory"
-);
-}
+// Get a blender that is optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ return ARGBBlendRow;
+ }
#endif
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420)
-void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Output a row of UV values, filtering 2 rows of YUY2
- for (int x = 0; x < pix; x += 2) {
- dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
- dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
- src_yuy2 += 4;
- dst_u += 1;
- dst_v += 1;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
}
+#endif
+ return ARGBBlendRow;
}
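Following the advice in the comment above GetARGBBlend(), a caller can resolve the row blender once and reuse it for every row. A minimal sketch with illustrative buffer names; ARGBBlend() below does essentially the same internally.

void BlendImage(const uint8* src0, int stride0,
                const uint8* src1, int stride1,
                uint8* dst, int dst_stride,
                int width, int height) {
  ARGBBlendRow blend = GetARGBBlend();  // chosen once for the whole image
  for (int y = 0; y < height; ++y) {
    blend(src0, src1, dst, width);
    src0 += stride0;
    src1 += stride1;
    dst += dst_stride;
  }
}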
-void YUY2ToI420RowY_C(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- // Copy a row of yuy2 Y values
- for (int x = 0; x < pix; ++x) {
- dst_y[0] = src_yuy2[0];
- src_yuy2 += 2;
- dst_y += 1;
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of uyvy UV values
- for (int x = 0; x < pix; x += 2) {
- dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
- dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
- src_uyvy += 4;
- dst_u += 1;
- dst_v += 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-}
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
-void UYVYToI420RowY_C(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- // Copy a row of uyvy Y values
- for (int x = 0; x < pix; ++x) {
- dst_y[0] = src_uyvy[1];
- src_uyvy += 2;
- dst_y += 1;
+ for (int y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
}
+ return 0;
}
-// Convert YUY2 to I420.
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height) {
- // Negative height means invert the image.
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
- void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*YUY2ToI420RowY)(const uint8* src_yuy2,
- uint8* dst_y, int pix);
-#if defined(HAS_YUY2TOI420ROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
- YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
- } else
-#endif
- {
- YUY2ToI420RowY = YUY2ToI420RowY_C;
- YUY2ToI420RowUV = YUY2ToI420RowUV_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
+#endif
+
for (int y = 0; y < height; ++y) {
- if ((y & 1) == 0) {
- if (y >= (height - 1) ) { // last chroma on odd height clamp height
- src_stride_yuy2 = 0;
- }
- YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- YUY2ToI420RowY(src_yuy2, dst_y, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
dst_y += dst_stride_y;
- src_yuy2 += src_stride_yuy2;
}
return 0;
}
-// Convert UYVY to I420.
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+// ARGB (little endian, BGRA byte order in memory) to I422.
+// Same as I420 except the U and V planes are full height.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- // Negative height means invert the image.
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*UYVYToI420RowY)(const uint8* src_uyvy,
- uint8* dst_y, int pix);
-#if defined(HAS_UYVYTOI420ROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- UYVYToI420RowY = UYVYToI420RowY_SSE2;
- UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
- } else
-#endif
- {
- UYVYToI420RowY = UYVYToI420RowY_C;
- UYVYToI420RowUV = UYVYToI420RowUV_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
- for (int y = 0; y < height; ++y) {
- if ((y & 1) == 0) {
- if (y >= (height - 1) ) { // last chroma on odd height clamp height
- src_stride_uyvy = 0;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
}
- UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
}
- UYVYToI420RowY(src_uyvy, dst_y, width);
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
dst_y += dst_stride_y;
- src_uyvy += src_stride_uyvy;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
}
return 0;
}
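To make the "full height" remark above concrete, a quick size check (plain arithmetic, not libyuv API):

// Bytes needed for each chroma plane, rounding odd dimensions up.
int I420ChromaPlaneSize(int width, int height) {
  return ((width + 1) / 2) * ((height + 1) / 2);  // half width, half height
}
int I422ChromaPlaneSize(int width, int height) {
  return ((width + 1) / 2) * height;              // half width, full height
}
// e.g. 1280x720: I420 U or V = 640 * 360 = 230400 bytes,
//                I422 U or V = 640 * 720 = 460800 bytes.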
-// Convert I420 to ARGB.
-// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I420 to BGRA.
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- EMMS();
return 0;
}
-// Convert I420 to BGRA.
-int I420ToABGR(const uint8* src_y, int src_stride_y,
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
}
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- EMMS();
return 0;
}
-// Convert I422 to ARGB.
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
+ if (!src_argb || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) =
+ ARGBToRGBARow_C;
+#if defined(HAS_ARGBTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ ARGBToRGBARow = ARGBToRGBARow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBToRGBARow = ARGBToRGBARow_NEON;
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
+ ARGBToRGBARow(src_argb, dst_rgba, width);
+ src_argb += src_stride_argb;
+ dst_rgba += dst_stride_rgba;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I444 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- // Negative height means invert the image.
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB24Row_C;
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I400 to ARGB.
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- // Negative height means invert the image.
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRAWRow_C;
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYToRGB32Row(src_y, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// TODO(fbarchard): 64 bit version
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-
-#define HAS_I400TOARGBROW_SSE2
-__declspec(naked)
-static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
-
- wloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm7
- por xmm1, xmm7
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- ja wloop
- ret
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-#define HAS_ABGRTOARGBROW_SSSE3
-__declspec(naked)
-static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_abgr
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm7, _kShuffleMaskABGRToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- ja convertloop
- ret
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
-}
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
-#define HAS_BGRATOARGBROW_SSSE3
-__declspec(naked)
-static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_bgra
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm7, _kShuffleMaskBGRAToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- ja convertloop
- ret
+ for (int y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
}
+ return 0;
}
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-// TODO(yuche): consider moving ARGB related codes to a separate file.
-#define HAS_I400TOARGBROW_SSE2
-static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "pslld $0x18,%%xmm7\n"
-"1:"
- "movq (%0),%%xmm0\n"
- "lea 0x8(%0),%0\n"
- "punpcklbw %%xmm0,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "punpcklwd %%xmm0,%%xmm0\n"
- "punpckhwd %%xmm1,%%xmm1\n"
- "por %%xmm7,%%xmm0\n"
- "por %%xmm7,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "lea 0x20(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
}
-#define HAS_ABGRTOARGBROW_SSSE3
-static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
- int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
- : "+r"(src_abgr), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskABGRToARGB) // %3
- : "memory"
-);
-}
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
-#define HAS_BGRATOARGBROW_SSSE3
-static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
- int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
- : "+r"(src_bgra), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskBGRAToARGB) // %3
- : "memory"
-);
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
}
+// Convert NV12 to RGB565.
+// TODO(fbarchard): (Re) Optimize for Neon.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
#endif
-static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
- // Copy a Y to RGB.
- for (int x = 0; x < pix; ++x) {
- uint8 y = src_y[0];
- dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
- dst_argb[3] = 255u;
- dst_argb += 4;
- ++src_y;
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, row, width);
+ ARGBToRGB565Row(row, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
}
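As a hedged usage sketch (not part of this patch): NV12 stores a full-size Y plane followed by an interleaved UV plane at half vertical resolution, so a call for a 640x480 frame could look like the following. The buffer names, frame size, and the planar_functions.h declaration location are assumptions.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Convert a packed 640x480 NV12 buffer (Y plane, then an interleaved UV
// plane of height 240) to RGB565. Names and sizes here are illustrative.
int ConvertNV12FrameToRGB565(const uint8* nv12, uint8* dst_rgb565) {
  const int width = 640;
  const int height = 480;
  const uint8* src_y = nv12;
  const uint8* src_uv = nv12 + width * height;  // interleaved U/V pairs
  return libyuv::NV12ToRGB565(src_y, width,
                              src_uv, width,          // UV stride in bytes
                              dst_rgb565, width * 2,  // 2 bytes per pixel
                              width, height);
}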
-// Convert I400 to ARGB.
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
- void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- } else
#endif
- {
- I400ToARGBRow = I400ToARGBRow_C;
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
+#endif
for (int y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ NV21ToARGBRow(src_y, src_vu, row, width);
+ ARGBToRGB565Row(row, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
- dst_argb += dst_stride_argb;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
}
return 0;
}
-static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- // To support in-place conversion.
- uint8 r = src_abgr[0];
- uint8 g = src_abgr[1];
- uint8 b = src_abgr[2];
- uint8 a = src_abgr[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_abgr += 4;
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value) {
+ void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ SetRow = SetRow8_NEON;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ SetRow = SetRow8_X86;
+ }
+#endif
+
+ uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
+ // Set plane
+ for (int y = 0; y < height; ++y) {
+ SetRow(dst_y, v32, width);
+ dst_y += dst_stride_y;
}
}
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y,
+ int width, int height,
+ int value_y, int value_u, int value_v) {
+ if (!dst_y || !dst_u || !dst_v ||
+ width <= 0 || height <= 0 ||
+ x < 0 || y < 0 ||
+ value_y < 0 || value_y > 255 ||
+ value_u < 0 || value_u > 255 ||
+ value_v < 0 || value_v > 255) {
+ return -1;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+ SetPlane(start_y, dst_stride_y, width, height, value_y);
+ SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+ SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+ return 0;
+}
+
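A hedged usage sketch for I420Rect (not part of this patch); the plane names and frame size are illustrative.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Fill a 32x32 mid-gray square at (8, 8) of a small 64x48 I420 frame.
void DrawGraySquare() {
  const int kWidth = 64;
  const int kHeight = 48;
  const int kHalfWidth = (kWidth + 1) / 2;
  const int kHalfHeight = (kHeight + 1) / 2;
  static uint8 y_plane[kWidth * kHeight];
  static uint8 u_plane[kHalfWidth * kHalfHeight];
  static uint8 v_plane[kHalfWidth * kHalfHeight];
  // Y = 128 with U = V = 128 is a neutral mid gray.
  libyuv::I420Rect(y_plane, kWidth,
                   u_plane, kHalfWidth,
                   v_plane, kHalfWidth,
                   8, 8,            // x, y
                   32, 32,          // width, height of the rectangle
                   128, 128, 128);  // value_y, value_u, value_v
}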
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height,
+ uint32 value) {
+ if (!dst_argb ||
+ width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SetRows32_NEON(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRows32_X86(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+ SetRows32_C(dst, value, width, dst_stride_argb, height);
+ return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
-void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
-#if defined(HAS_ABGRTOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- ABGRToARGBRow = ABGRToARGBRow_SSSE3;
- } else
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+#if defined(HAS_ARGBATTENUATE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
#endif
- {
- ABGRToARGBRow = ABGRToARGBRow_C;
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
+#endif
for (int y = 0; y < height; ++y) {
- ABGRToARGBRow(src_abgr, dst_argb, width);
- src_abgr += src_stride_abgr;
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
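A minimal per-pixel C sketch of the attenuation the comment above describes, multiplying each color channel by its alpha so the cheaper blend p = f + (1 - a) * b can be used. The optimized rows in this patch may round differently; this is an illustration, not the row implementation.

#include "libyuv/basic_types.h"  // for uint8/uint32; assumed include path

// Illustrative only: premultiply one BGRA pixel (byte order B, G, R, A in
// memory) by its alpha. Rounding here is a plain /255, which may not match
// the SSE2/SSSE3 rows bit for bit.
static void AttenuatePixel_Reference(const uint8 src[4], uint8 dst[4]) {
  const uint32 a = src[3];
  dst[0] = static_cast<uint8>(src[0] * a / 255);  // B
  dst[1] = static_cast<uint8>(src[1] * a / 255);  // G
  dst[2] = static_cast<uint8>(src[2] * a / 255);  // R
  dst[3] = src[3];                                // A unchanged
}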
-static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- // To support in-place conversion.
- uint8 a = src_bgra[0];
- uint8 r = src_bgra[1];
- uint8 g = src_bgra[2];
- uint8 b = src_bgra[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_bgra += 4;
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-// Convert BGRA to ARGB.
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
if (height < 0) {
height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
- void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
-#if defined(HAS_BGRATOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- BGRAToARGBRow = BGRAToARGBRow_SSSE3;
- } else
-#endif
- {
- BGRAToARGBRow = BGRAToARGBRow_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- BGRAToARGBRow(src_bgra, dst_argb, width);
- src_bgra += src_stride_bgra;
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
-// Convert ARGB to I400.
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
+// Convert ARGB to Grayed ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- } else
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
#endif
- {
- ARGBToYRow = ARGBToYRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(dst, dst, width);
+ dst += dst_stride_argb;
}
+ return 0;
+}
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
for (int y = 0; y < height; ++y) {
- ARGBToYRow(src_argb, dst_y, width);
+ ARGBSepiaRow(dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x3 color matrix to each ARGB pixel.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
+ int width) = ARGBColorMatrixRow_C;
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(dst, matrix_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBColorTableRow = ARGBColorTableRow_X86;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
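An illustrative per-pixel version of the lookup ARGBColorTable performs, assuming the 256-entry table is laid out as consecutive B, G, R, A quadruplets and each channel indexes its own column. The layout is an assumption, not a quote of the row code.

#include "libyuv/basic_types.h"  // for uint8; assumed include path

// Hypothetical reference for a single pixel: each channel selects its own
// component from the assumed 256 x 4 byte table.
static void ColorTablePixel_Reference(uint8 pixel[4], const uint8* table_argb) {
  pixel[0] = table_argb[pixel[0] * 4 + 0];  // B
  pixel[1] = table_argb[pixel[1] * 4 + 1];  // G
  pixel[2] = table_argb[pixel[2] * 4 + 2];  // R
  pixel[3] = table_argb[pixel[3] * 4 + 3];  // A
}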
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low-level rows implement it efficiently with 3 parameters, and
+// could be used for other high-level operations.
+// The divide is replaced with a fixed-point multiply by the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+ interval_size < 1 || interval_size > 255) {
+ return -1;
+ }
+ void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
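A worked example of choosing the three parameters for the posterize expression above. Treating scale as a 16.16 fixed-point reciprocal of interval_size is an assumption about how the row functions consume it, and the helper name is illustrative.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Hedged sketch: posterize an ARGB image to 4 levels per channel, i.e.
// rgb / 64 * 64 + 32, with the divide expressed as a reciprocal multiply.
int PosterizeTo4Levels(uint8* dst_argb, int dst_stride_argb,
                       int width, int height) {
  const int interval_size = 64;
  const int interval_offset = interval_size / 2;  // 32
  const int scale = 65536 / interval_size;        // assumed 16.16 reciprocal
  return libyuv::ARGBQuantize(dst_argb, dst_stride_argb,
                              scale, interval_size, interval_offset,
                              0, 0, width, height);
}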
+// Computes a table of cumulative sums for the image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height) {
+ if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ }
+#endif
+ memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
+ int32* previous_cumsum = dst_cumsum;
+ for (int y = 0; y < height; ++y) {
+ ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+ previous_cumsum = dst_cumsum;
+ dst_cumsum += dst_stride32_cumsum;
src_argb += src_stride_argb;
- dst_y += dst_stride_y;
}
return 0;
}
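The table this builds is an integral image: any axis-aligned box sum then needs only four lookups. A minimal sketch of that identity for one channel (illustrative; not the row code used by ARGBBlur):

#include "libyuv/basic_types.h"  // for int32; assumed include path

// Illustrative: with cumsum(x, y) holding the sum of all values above and to
// the left of (x, y), the sum over the half-open box [x0, x1) x [y0, y1) is:
static int32 BoxSum(const int32* cumsum, int stride,
                    int x0, int y0, int x1, int y1) {
  return cumsum[y1 * stride + x1] - cumsum[y1 * stride + x0]
       - cumsum[y0 * stride + x1] + cumsum[y0 * stride + x0];
}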
+// Blur ARGB image.
+// The caller should allocate a CumulativeSum table of width * height * 16
+// bytes, aligned to a 16-byte boundary. height can be radius * 2 + 2 to save
+// memory, as the buffer is treated as circular.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
+ }
+#endif
+ // Compute enough CumulativeSum for first row to be blurred. After this
+ // one row of CumulativeSum is updated at a time.
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+ dst_cumsum, dst_stride32_cumsum,
+ width, radius);
-// Convert RAW to ARGB.
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ src_argb = src_argb + radius * src_stride_argb;
+ int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+ const int32* max_cumsum_bot_row =
+ &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ const int32* cumsum_top_row = &dst_cumsum[0];
+
+ for (int y = 0; y < height; ++y) {
+ int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+ int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+ int area = radius * (bot_y - top_y);
+
+ // Increment cumsum_top_row pointer with circular buffer wrap around.
+ if (top_y) {
+ cumsum_top_row += dst_stride32_cumsum;
+ if (cumsum_top_row >= max_cumsum_bot_row) {
+ cumsum_top_row = dst_cumsum;
+ }
+ }
+ // Increment cumsum_bot_row pointer with circular buffer wrap around and
+ // then fill in a row of CumulativeSum.
+ if ((y + radius) < height) {
+ const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ cumsum_bot_row += dst_stride32_cumsum;
+ if (cumsum_bot_row >= max_cumsum_bot_row) {
+ cumsum_bot_row = dst_cumsum;
+ }
+ ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+ width);
+ src_argb += src_stride_argb;
+ }
+
+ // Left clipped.
+ int boxwidth = radius * 4;
+ int x;
+ for (x = 0; x < radius + 1; ++x) {
+ CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ area += (bot_y - top_y);
+ boxwidth += 4;
+ }
+
+ // Middle unclipped.
+ int n = (width - 1) - radius - x + 1;
+ CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
+
+ // Right clipped.
+ for (x += n; x <= width - 1; ++x) {
+ area -= (bot_y - top_y);
+ boxwidth -= 4;
+ CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ }
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
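A hedged usage sketch that follows the allocation rule in the comment above (16 bytes of cumulative sum per pixel, with only radius * 2 + 2 rows because the buffer is circular). The wrapper name and the use of operator new are illustrative, and a real caller should guarantee 16-byte alignment.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Illustrative wrapper: allocate the circular cumulative-sum buffer and blur.
int BlurARGBFrame(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height, int radius) {
  const int cumsum_rows = radius * 2 + 2;  // circular buffer height
  const int stride32 = width * 4;          // 4 int32 (16 bytes) per pixel
  // Use an aligned allocator in real code to satisfy the 16-byte requirement.
  int32* cumsum = new int32[stride32 * cumsum_rows];
  int ret = libyuv::ARGBBlur(src_argb, src_stride_argb,
                             dst_argb, dst_stride_argb,
                             cumsum, stride32,
                             width, height, radius);
  delete[] cumsum;
  return ret;
}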
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+ int width, int height, uint32 value) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- } else
-#endif
- {
- RAWToARGBRow = RAWToARGBRow_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, uint32 value) = ARGBShadeRow_C;
+#if defined(HAS_ARGBSHADE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBShadeRow = ARGBShadeRow_SSE2;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
+ ARGBShadeRow(src_argb, dst_argb, width, value);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
-// Convert BG24 to ARGB.
-int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
- src_stride_bg24 = -src_stride_bg24;
- }
- void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
-#if defined(HAS_BG24TOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- BG24ToARGBRow = BG24ToARGBRow_SSSE3;
- } else
-#endif
- {
- BG24ToARGBRow = BG24ToARGBRow_C;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-
+ void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = ARGBInterpolateRow_C;
+#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+ IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- BG24ToARGBRow(src_bg24, dst_argb, width);
- src_bg24 += src_stride_bg24;
+ ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+ width, interpolation);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
return 0;
}
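A per-channel sketch of the blend performed for an interpolation amount in 0..255 (0 leaves the first image, 255 is almost entirely the second). The optimized rows may round differently, so this is an illustration rather than the exact row code.

#include "libyuv/basic_types.h"  // for uint8; assumed include path

// Illustrative linear interpolation of one channel by a 0..255 fraction.
static uint8 LerpChannel(uint8 c0, uint8 c1, int interpolation) {
  return static_cast<uint8>(
      (c0 * (256 - interpolation) + c1 * interpolation) >> 8);
}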
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
-
+#endif
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index 12cdd7e1..cac3fa0b 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,49 +8,44 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
-#include "rotate_priv.h"
#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
#endif
-// Shuffle table for reversing the bytes.
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
- { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
-// Shuffle table for reversing the bytes of UV channels.
-extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
- { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif
-typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
-typedef void (*reverse_func)(const uint8*, uint8*, int);
-typedef void (*rotate_uv_wx8_func)(const uint8*, int,
- uint8*, int,
- uint8*, int, int);
-typedef void (*rotate_uv_wxh_func)(const uint8*, int,
- uint8*, int,
- uint8*, int, int, int);
-typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
-typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
-
-#if 0 // Need to add rotate_neon.s to the build to enable this
-#ifdef __ARM_NEON__
-extern "C" {
-void RestoreRegisters_NEON(unsigned long long *restore);
-void SaveRegisters_NEON(unsigned long long *store);
-#define HAS_REVERSE_LINE_NEON
-void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_REVERSE_LINE_UV_NEON
-void ReverseLineUV_NEON(const uint8* src,
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#define HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_MIRRORROW_UV_NEON
+void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width);
#define HAS_TRANSPOSE_WX8_NEON
@@ -61,16 +56,14 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
-} // extern "C"
-#endif
-#endif
+#endif // defined(__ARM_NEON__)
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
-__asm {
+ __asm {
push edi
push esi
push ebp
@@ -79,9 +72,11 @@ __asm {
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
- convertloop :
+
// Read in the data from the source pointer.
// First round of bit swap.
+ align 16
+ convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
@@ -144,10 +139,10 @@ __asm {
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
+ sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
- sub ecx, 8
- ja convertloop
+ jg convertloop
pop ebp
pop esi
@@ -157,12 +152,12 @@ __asm {
}
#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
-__asm {
+ __asm {
push ebx
push esi
push edi
@@ -178,7 +173,9 @@ __asm {
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
- convertloop :
+
+ align 16
+ convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqa xmm0, [eax]
@@ -268,12 +265,12 @@ __asm {
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
+ sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
- sub ecx, 8
- ja convertloop
+ jg convertloop
mov esp, [esp + 16]
pop ebp
@@ -283,356 +280,355 @@ __asm {
ret
}
}
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
- asm volatile(
-"1:"
- // Read in the data from the source pointer.
- // First round of bit swap.
- "movq (%0),%%xmm0\n"
- "movq (%0,%3),%%xmm1\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "movq (%0),%%xmm2\n"
- "movdqa %%xmm0,%%xmm1\n"
- "palignr $0x8,%%xmm1,%%xmm1\n"
- "movq (%0,%3),%%xmm3\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,%%xmm3\n"
- "movq (%0),%%xmm4\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "movq (%0,%3),%%xmm5\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "movdqa %%xmm4,%%xmm5\n"
- "movq (%0),%%xmm6\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq (%0,%3),%%xmm7\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "neg %3\n"
- "movdqa %%xmm6,%%xmm7\n"
- "lea 0x8(%0,%3,8),%0\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "neg %3\n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "palignr $0x8,%%xmm2,%%xmm2\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm4,%%xmm6\n"
- "movdqa %%xmm5,%%xmm7\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqa %%xmm0,%%xmm4\n"
- "palignr $0x8,%%xmm4,%%xmm4\n"
- "movq %%xmm4,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movdqa %%xmm2,%%xmm6\n"
- "movq %%xmm2,(%1)\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movq %%xmm6,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm1,%%xmm5\n"
- "movq %%xmm1,(%1)\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq %%xmm5,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movq %%xmm3,(%1)\n"
- "movdqa %%xmm3,%%xmm7\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "movq %%xmm7,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
-);
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "r"(static_cast<intptr_t>(dst_stride)) // %4
+ : "memory", "cc"
+ #if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ #endif
+ );
}
-#if defined (__i386__)
+#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _TransposeUVWx8_SSE2\n"
-"_TransposeUVWx8_SSE2:\n"
-#else
- ".global TransposeUVWx8_SSE2\n"
-"TransposeUVWx8_SSE2:\n"
-#endif
- "push %ebx\n"
- "push %esi\n"
- "push %edi\n"
- "push %ebp\n"
- "mov 0x14(%esp),%eax\n"
- "mov 0x18(%esp),%edi\n"
- "mov 0x1c(%esp),%edx\n"
- "mov 0x20(%esp),%esi\n"
- "mov 0x24(%esp),%ebx\n"
- "mov 0x28(%esp),%ebp\n"
- "mov %esp,%ecx\n"
- "sub $0x14,%esp\n"
- "and $0xfffffff0,%esp\n"
- "mov %ecx,0x10(%esp)\n"
- "mov 0x2c(%ecx),%ecx\n"
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
-"1:"
- "movdqa (%eax),%xmm0\n"
- "movdqa (%eax,%edi,1),%xmm1\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm0,%xmm7\n"
- "punpcklbw %xmm1,%xmm0\n"
- "punpckhbw %xmm1,%xmm7\n"
- "movdqa %xmm7,%xmm1\n"
- "movdqa (%eax),%xmm2\n"
- "movdqa (%eax,%edi,1),%xmm3\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm2,%xmm7\n"
- "punpcklbw %xmm3,%xmm2\n"
- "punpckhbw %xmm3,%xmm7\n"
- "movdqa %xmm7,%xmm3\n"
- "movdqa (%eax),%xmm4\n"
- "movdqa (%eax,%edi,1),%xmm5\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm4,%xmm7\n"
- "punpcklbw %xmm5,%xmm4\n"
- "punpckhbw %xmm5,%xmm7\n"
- "movdqa %xmm7,%xmm5\n"
- "movdqa (%eax),%xmm6\n"
- "movdqa (%eax,%edi,1),%xmm7\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm5,(%esp)\n"
- "neg %edi\n"
- "movdqa %xmm6,%xmm5\n"
- "punpcklbw %xmm7,%xmm6\n"
- "punpckhbw %xmm7,%xmm5\n"
- "movdqa %xmm5,%xmm7\n"
- "lea 0x10(%eax,%edi,8),%eax\n"
- "neg %edi\n"
- "movdqa %xmm0,%xmm5\n"
- "punpcklwd %xmm2,%xmm0\n"
- "punpckhwd %xmm2,%xmm5\n"
- "movdqa %xmm5,%xmm2\n"
- "movdqa %xmm1,%xmm5\n"
- "punpcklwd %xmm3,%xmm1\n"
- "punpckhwd %xmm3,%xmm5\n"
- "movdqa %xmm5,%xmm3\n"
- "movdqa %xmm4,%xmm5\n"
- "punpcklwd %xmm6,%xmm4\n"
- "punpckhwd %xmm6,%xmm5\n"
- "movdqa %xmm5,%xmm6\n"
- "movdqa (%esp),%xmm5\n"
- "movdqa %xmm6,(%esp)\n"
- "movdqa %xmm5,%xmm6\n"
- "punpcklwd %xmm7,%xmm5\n"
- "punpckhwd %xmm7,%xmm6\n"
- "movdqa %xmm6,%xmm7\n"
- "movdqa %xmm0,%xmm6\n"
- "punpckldq %xmm4,%xmm0\n"
- "punpckhdq %xmm4,%xmm6\n"
- "movdqa %xmm6,%xmm4\n"
- "movdqa (%esp),%xmm6\n"
- "movlpd %xmm0,(%edx)\n"
- "movhpd %xmm0,(%ebx)\n"
- "movlpd %xmm4,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm4,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm2,%xmm0\n"
- "punpckldq %xmm6,%xmm2\n"
- "movlpd %xmm2,(%edx)\n"
- "movhpd %xmm2,(%ebx)\n"
- "punpckhdq %xmm6,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm1,%xmm0\n"
- "punpckldq %xmm5,%xmm1\n"
- "movlpd %xmm1,(%edx)\n"
- "movhpd %xmm1,(%ebx)\n"
- "punpckhdq %xmm5,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm3,%xmm0\n"
- "punpckldq %xmm7,%xmm3\n"
- "movlpd %xmm3,(%edx)\n"
- "movhpd %xmm3,(%ebx)\n"
- "punpckhdq %xmm7,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "sub $0x8,%ecx\n"
- "ja 1b\n"
- "mov 0x10(%esp),%esp\n"
- "pop %ebp\n"
- "pop %edi\n"
- "pop %esi\n"
- "pop %ebx\n"
- "ret\n"
+"1: \n"
+ "movdqa (%eax),%xmm0 \n"
+ "movdqa (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqa (%eax),%xmm2 \n"
+ "movdqa (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqa (%eax),%xmm4 \n"
+ "movdqa (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqa (%eax),%xmm6 \n"
+ "movdqa (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqa (%esp),%xmm5 \n"
+ "movdqa %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqa (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+ "ret \n"
);
-#elif defined (__x86_64__)
+#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
- asm volatile(
-"1:"
+ asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3),%%xmm1\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm0,%%xmm8\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "punpckhbw %%xmm1,%%xmm8\n"
- "movdqa (%0),%%xmm2\n"
- "movdqa %%xmm0,%%xmm1\n"
- "movdqa %%xmm8,%%xmm9\n"
- "palignr $0x8,%%xmm1,%%xmm1\n"
- "palignr $0x8,%%xmm9,%%xmm9\n"
- "movdqa (%0,%3),%%xmm3\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm2,%%xmm10\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "punpckhbw %%xmm3,%%xmm10\n"
- "movdqa %%xmm2,%%xmm3\n"
- "movdqa %%xmm10,%%xmm11\n"
- "movdqa (%0),%%xmm4\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "palignr $0x8,%%xmm11,%%xmm11\n"
- "movdqa (%0,%3),%%xmm5\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm4,%%xmm12\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "punpckhbw %%xmm5,%%xmm12\n"
- "movdqa %%xmm4,%%xmm5\n"
- "movdqa %%xmm12,%%xmm13\n"
- "movdqa (%0),%%xmm6\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "palignr $0x8,%%xmm13,%%xmm13\n"
- "movdqa (%0,%3),%%xmm7\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm6,%%xmm14\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "punpckhbw %%xmm7,%%xmm14\n"
- "neg %3\n"
- "movdqa %%xmm6,%%xmm7\n"
- "movdqa %%xmm14,%%xmm15\n"
- "lea 0x10(%0,%3,8),%0\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
- "neg %3\n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqa (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqa (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "palignr $0x8,%%xmm2,%%xmm2\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm4,%%xmm6\n"
- "movdqa %%xmm5,%%xmm7\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "punpcklwd %%xmm10,%%xmm8\n"
- "punpcklwd %%xmm11,%%xmm9\n"
- "movdqa %%xmm8,%%xmm10\n"
- "movdqa %%xmm9,%%xmm11\n"
- "palignr $0x8,%%xmm10,%%xmm10\n"
- "palignr $0x8,%%xmm11,%%xmm11\n"
- "punpcklwd %%xmm14,%%xmm12\n"
- "punpcklwd %%xmm15,%%xmm13\n"
- "movdqa %%xmm12,%%xmm14\n"
- "movdqa %%xmm13,%%xmm15\n"
- "palignr $0x8,%%xmm14,%%xmm14\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqa %%xmm0,%%xmm4\n"
- "palignr $0x8,%%xmm4,%%xmm4\n"
- "movq %%xmm4,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movdqa %%xmm2,%%xmm6\n"
- "movq %%xmm2,(%1)\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movq %%xmm6,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm1,%%xmm5\n"
- "movq %%xmm1,(%1)\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq %%xmm5,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movq %%xmm3,(%1)\n"
- "movdqa %%xmm3,%%xmm7\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "movq %%xmm7,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm12,%%xmm8\n"
- "movq %%xmm8,(%1)\n"
- "movdqa %%xmm8,%%xmm12\n"
- "palignr $0x8,%%xmm12,%%xmm12\n"
- "movq %%xmm12,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm14,%%xmm10\n"
- "movdqa %%xmm10,%%xmm14\n"
- "movq %%xmm10,(%1)\n"
- "palignr $0x8,%%xmm14,%%xmm14\n"
- "punpckldq %%xmm13,%%xmm9\n"
- "movq %%xmm14,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm9,%%xmm13\n"
- "movq %%xmm9,(%1)\n"
- "palignr $0x8,%%xmm13,%%xmm13\n"
- "movq %%xmm13,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm15,%%xmm11\n"
- "movq %%xmm11,(%1)\n"
- "movdqa %%xmm11,%%xmm15\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
- "movq %%xmm15,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
@@ -641,98 +637,99 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
- asm volatile(
-"1:"
+ asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%4),%%xmm1\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm0,%%xmm8\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "punpckhbw %%xmm1,%%xmm8\n"
- "movdqa %%xmm8,%%xmm1\n"
- "movdqa (%0),%%xmm2\n"
- "movdqa (%0,%4),%%xmm3\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm2,%%xmm8\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "punpckhbw %%xmm3,%%xmm8\n"
- "movdqa %%xmm8,%%xmm3\n"
- "movdqa (%0),%%xmm4\n"
- "movdqa (%0,%4),%%xmm5\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm4,%%xmm8\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "punpckhbw %%xmm5,%%xmm8\n"
- "movdqa %%xmm8,%%xmm5\n"
- "movdqa (%0),%%xmm6\n"
- "movdqa (%0,%4),%%xmm7\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm6,%%xmm8\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "neg %4\n"
- "lea 0x10(%0,%4,8),%0\n"
- "punpckhbw %%xmm7,%%xmm8\n"
- "movdqa %%xmm8,%%xmm7\n"
- "neg %4\n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8\n"
- "movdqa %%xmm1,%%xmm9\n"
- "punpckhwd %%xmm2,%%xmm8\n"
- "punpckhwd %%xmm3,%%xmm9\n"
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm8,%%xmm2\n"
- "movdqa %%xmm9,%%xmm3\n"
- "movdqa %%xmm4,%%xmm8\n"
- "movdqa %%xmm5,%%xmm9\n"
- "punpckhwd %%xmm6,%%xmm8\n"
- "punpckhwd %%xmm7,%%xmm9\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm8,%%xmm6\n"
- "movdqa %%xmm9,%%xmm7\n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8\n"
- "punpckldq %%xmm4,%%xmm0\n"
- "movlpd %%xmm0,(%1)\n" // Write back U channel
- "movhpd %%xmm0,(%2)\n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm2,%%xmm8\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movlpd %%xmm2,(%1)\n"
- "movhpd %%xmm2,(%2)\n"
- "punpckhdq %%xmm6,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm1,%%xmm8\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movlpd %%xmm1,(%1)\n"
- "movhpd %%xmm1,(%2)\n"
- "punpckhdq %%xmm5,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm3,%%xmm8\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movlpd %%xmm3,(%1)\n"
- "movhpd %%xmm3,(%2)\n"
- "punpckhdq %%xmm7,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "sub $0x8,%3\n"
- "ja 1b\n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
@@ -740,7 +737,9 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
: "r"(static_cast<intptr_t>(src_stride)), // %4
"r"(static_cast<intptr_t>(dst_stride_a)), // %5
"r"(static_cast<intptr_t>(dst_stride_b)) // %6
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
);
}
#endif
@@ -748,9 +747,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
- int w) {
- int i;
- for (i = 0; i < w; ++i) {
+ int width) {
+ for (int i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -767,184 +765,143 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i, j;
- for (i = 0; i < width; ++i)
- for (j = 0; j < height; ++j)
+ for (int i = 0; i < width; ++i) {
+ for (int j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
}
+LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i = height;
- rotate_wx8_func TransposeWx8;
- rotate_wxh_func TransposeWxH;
-
+ void (*TransposeWx8)(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
- TransposeWxH = TransposeWxH_C;
- } else
-#endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
- TransposeWx8 = TransposeWx8_FAST_SSSE3;
- TransposeWxH = TransposeWxH_C;
- } else
+ }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
- TransposeWxH = TransposeWxH_C;
- } else
+ }
#endif
- {
- TransposeWx8 = TransposeWx8_C;
- TransposeWxH = TransposeWxH_C;
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
+#endif
- // work across the source in 8x8 tiles
+ // Work across the source in 8x8 tiles
+ int i = height;
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
-
- src += 8 * src_stride; // go down 8 rows
- dst += 8; // move over 8 columns
- i -= 8;
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
}
- TransposeWxH(src, src_stride, dst, dst_stride, width, i);
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
+LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
- // from bottom to top. So set the source pointer to the end
+ // from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
-
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
+LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
- // from bottom to top. So set the destination pointer to the end
+ // from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
-
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
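
Annotation: the two functions above reduce a 90- or 270-degree rotation to TransposePlane with one scan direction reversed. A minimal scalar reference (illustrative only, not the library kernel, with hypothetical 3x2 test data in main) showing that reading the source bottom-up while transposing produces a clockwise 90-degree rotation:

#include <cstdint>
#include <cstdio>

// Illustrative reference: rotate a width x height byte plane 90 degrees
// clockwise by transposing a vertically flipped view of the source.
// dst is height x width; strides are in bytes.
void RotatePlane90_Reference(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Source row y becomes destination column (height - 1 - y).
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}

int main() {
  // 3x2 source:        90-degree clockwise result (2 wide, 3 tall):
  //   1 2 3              4 1
  //   4 5 6              5 2
  //                      6 3
  const uint8_t src[6] = {1, 2, 3, 4, 5, 6};
  uint8_t dst[6] = {0};
  RotatePlane90_Reference(src, 3, dst, 2, 3, 2);
  for (int y = 0; y < 3; ++y) printf("%d %d\n", dst[y * 2], dst[y * 2 + 1]);
  return 0;
}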
-static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
- int i;
- src += width - 1;
- for (i = 0; i < width; ++i) {
- dst[i] = src[0];
- --src;
- }
-}
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_REVERSE_LINE_SSSE3
-__declspec(naked)
-static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm7, _kShuffleReverse
- lea eax, [eax + ecx - 16]
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja convertloop
- ret
- }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_REVERSE_LINE_SSSE3
-static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile(
- "movdqa (%3),%%xmm7\n"
- "lea -0x10(%0,%2,1),%0\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "r"(kShuffleReverse) // %3
- : "memory"
-);
-}
-#endif
-
+LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i;
- reverse_func ReverseLine;
-
-#if defined(HAS_REVERSE_LINE_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
- ReverseLine = ReverseLine_NEON;
- } else
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_NEON;
+ }
#endif
-#if defined(HAS_REVERSE_LINE_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
- ReverseLine = ReverseLine_SSSE3;
- } else
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
#endif
- {
- ReverseLine = ReverseLine_C;
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
}
- // Rotate by 180 is a mirror and vertical flip
- src += src_stride * (height - 1);
-
- for (i = 0; i < height; ++i) {
- ReverseLine(src, dst, width);
- src -= src_stride;
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ if (width > kMaxStride) {
+ return;
+ }
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
}
}
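
Annotation: RotatePlane180 above combines a per-row mirror with a vertical flip, walking inward from both ends and using one temporary row so the first and last rows can trade places; an odd height simply re-mirrors the middle row. A compact scalar sketch of the same loop structure (the 1024-byte stack row is an assumption of this sketch; the real code uses SIMD_ALIGNED(row[kMaxStride]) and SIMD row kernels):

#include <cstdint>
#include <cstring>

// Scalar stand-in for the mirror row helper (illustrative only).
void MirrorRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
}

// Rotate a plane by 180 degrees: mirror each row and flip vertically.
// Mirrors the structure of RotatePlane180: the temporary row holds the
// mirrored top row so the top/bottom pair can be exchanged.
void RotatePlane180_Reference(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  uint8_t row[1024];  // sketch assumption: width <= 1024
  const uint8_t* src_bot = src + src_stride * (height - 1);
  uint8_t* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;  // odd height re-mirrors the middle row
  for (int y = 0; y < half_height; ++y) {
    MirrorRow_Ref(src, row, width);      // save mirrored top row
    src += src_stride;
    MirrorRow_Ref(src_bot, dst, width);  // mirrored bottom row -> top
    dst += dst_stride;
    memcpy(dst_bot, row, width);         // saved row -> bottom
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
}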
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
- int w) {
- int i;
- for (i = 0; i < w; ++i) {
+ int width) {
+ for (int i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
@@ -970,71 +927,55 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
- int w, int h) {
- int i, j;
- for (i = 0; i < w * 2; i += 2)
- for (j = 0; j < h; ++j) {
+ int width, int height) {
+ for (int i = 0; i < width * 2; i += 2)
+ for (int j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
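
Annotation: TransposeUVWxH_C both transposes and de-interleaves: byte i of source row j lands in row i/2 of dst_a (even bytes, U) or dst_b (odd bytes, V). A toy check of that index mapping with made-up values, using a block that is two UV pairs wide and three rows tall:

#include <cstdint>
#include <cstdio>

int main() {
  // Three rows of interleaved UV pairs: U0 V0 U1 V1 per row.
  const uint8_t src[12] = {10, 20, 11, 21,    // row 0: U=10,11  V=20,21
                           12, 22, 13, 23,    // row 1: U=12,13  V=22,23
                           14, 24, 15, 25};   // row 2: U=14,15  V=24,25
  const int width = 2, height = 3;            // width counted in UV pairs
  const int src_stride = 4, dst_stride = 3;
  uint8_t dst_u[6], dst_v[6];
  for (int i = 0; i < width * 2; i += 2) {
    for (int j = 0; j < height; ++j) {
      dst_u[j + (i >> 1) * dst_stride] = src[i + j * src_stride];
      dst_v[j + (i >> 1) * dst_stride] = src[i + j * src_stride + 1];
    }
  }
  // dst_u is now {10,12,14, 11,13,15}: U column n of the source became
  // U row n of the destination; dst_v holds the V samples likewise.
  printf("%d %d %d\n", dst_u[3], dst_u[4], dst_u[5]);  // prints 11 13 15
  return 0;
}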
+LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
- int i = height;
- rotate_uv_wx8_func TransposeWx8;
- rotate_uv_wxh_func TransposeWxH;
-
+ void (*TransposeUVWx8)(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
- unsigned long long store_reg[8];
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
- SaveRegisters_NEON(store_reg);
- TransposeWx8 = TransposeUVWx8_NEON;
- TransposeWxH = TransposeUVWxH_C;
- } else
-#endif
-#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
- TransposeWx8 = TransposeUVWx8_SSE2;
- TransposeWxH = TransposeUVWxH_C;
- } else
-#endif
- {
- TransposeWx8 = TransposeUVWx8_C;
- TransposeWxH = TransposeUVWxH_C;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
}
+#endif
- // work through the source in 8x8 tiles
+ // Work through the source in 8x8 tiles.
+ int i = height;
while (i >= 8) {
- TransposeWx8(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width);
-
- src += 8 * src_stride; // go down 8 rows
- dst_a += 8; // move over 8 columns
- dst_b += 8; // move over 8 columns
- i -= 8;
+ TransposeUVWx8(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
}
- TransposeWxH(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, i);
-
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
- RestoreRegisters_NEON(store_reg);
- }
-#endif
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
}
+LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -1048,6 +989,7 @@ void RotateUV90(const uint8* src, int src_stride,
width, height);
}
+LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -1063,119 +1005,38 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_REVERSE_LINE_UV_SSSE3
-__declspec(naked)
-void ReverseLineUV_SSSE3(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
-__asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_a
- mov edi, [esp + 4 + 12] // dst_b
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm7, _kShuffleReverseUV
- lea eax, [eax + ecx * 2 - 16]
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm7
- movlpd qword ptr [edx], xmm0
- lea edx, [edx + 8]
- movhpd qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 8
- ja convertloop
- pop edi
- ret
- }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_REVERSE_LINE_UV_SSSE3
-void ReverseLineUV_SSSE3(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile(
- "movdqa (%4),%%xmm7\n"
- "lea -0x10(%0,%3,2),%0\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movlpd %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "movhpd %%xmm0,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x8,%3\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(temp_width) // %3
- : "r"(kShuffleReverseUV) // %4
- : "memory"
-);
-}
-#endif
-
-static void ReverseLineUV_C(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
- int i;
- src += width << 1;
- for (i = 0; i < width; ++i) {
- src -= 2;
- dst_a[i] = src[0];
- dst_b[i] = src[1];
- }
-}
-
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
- int i;
- reverse_uv_func ReverseLine;
-
-#if defined(HAS_REVERSE_LINE_UV_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
- ReverseLine = ReverseLineUV_NEON;
- } else
-#endif
-#if defined(HAS_REVERSE_LINE_UV_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
- ReverseLine = ReverseLineUV_SSSE3;
- } else
-#endif
- {
- ReverseLine = ReverseLineUV_C;
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorRowUV_C;
+#if defined(HAS_MIRRORROW_UV_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRowUV = MirrorRowUV_NEON;
}
+#elif defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ MirrorRowUV = MirrorRowUV_SSSE3;
+ }
+#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
- for (i = 0; i < height; ++i) {
- ReverseLine(src, dst_a, dst_b, width);
-
- src += src_stride; // down one line at a time
- dst_a -= dst_stride_a; // nominally up one line at a time
- dst_b -= dst_stride_b; // nominally up one line at a time
+ for (int i = 0; i < height; ++i) {
+ MirrorRowUV(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
}
}
+LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1184,6 +1045,10 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height,
RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
@@ -1248,6 +1113,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
return -1;
}
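
Annotation: the new null-pointer and size checks make the entry point fail fast with -1. For context, a hedged usage sketch of I420Rotate; the helper name and the tightly packed buffer layout are assumptions of this example, not part of the library:

#include "libyuv/rotate.h"  // I420Rotate, kRotate90
#include <cstdint>
#include <vector>

// Rotate an I420 frame 90 degrees (sketch; assumes even width and height and
// tightly packed planes). A 90-degree rotation swaps width and height, so the
// destination strides are derived from the source height.
bool RotateI420By90(const uint8_t* src_y, const uint8_t* src_u,
                    const uint8_t* src_v, int width, int height,
                    std::vector<uint8_t>* dst) {
  const int dst_w = height;
  const int dst_h = width;
  dst->resize(dst_w * dst_h * 3 / 2);
  uint8_t* dst_y = dst->data();
  uint8_t* dst_u = dst_y + dst_w * dst_h;
  uint8_t* dst_v = dst_u + (dst_w / 2) * (dst_h / 2);
  return libyuv::I420Rotate(src_y, width, src_u, width / 2, src_v, width / 2,
                            dst_y, dst_w, dst_u, dst_w / 2, dst_v, dst_w / 2,
                            width, height, libyuv::kRotate90) == 0;
}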
+LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
@@ -1255,6 +1121,10 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height,
RotationMode mode) {
+ if (!src_y || !src_uv || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
@@ -1271,7 +1141,8 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
- return NV12ToI420(src_y, src_uv, src_stride_y,
+ return NV12ToI420(src_y, src_stride_y,
+ src_uv, src_stride_uv,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
@@ -1309,4 +1180,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
return -1;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
new file mode 100644
index 00000000..9c994467
--- /dev/null
+++ b/files/source/rotate_argb.cc
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
+ defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(height, 4) &&  // Height of src becomes width of dest.
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+
+ int src_pixel_step = src_stride / 4;
+ for (int i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
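
Annotation: ARGBTranspose reuses the ARGB scaler's row-down-even kernel as a strided gather: with the stride argument fixed at 0 and a step of src_stride/4 pixels, each call walks down one source column and writes it out as one destination row. A scalar model of that addressing (the _C variant is only declared above; this sketch illustrates the behavior and is not the library implementation):

#include <cstdint>

// Model of ScaleARGBRowDownEven as used by ARGBTranspose: copy dst_width
// ARGB pixels, taking every src_stepx-th pixel from src. With
// src_stepx == src_stride / 4, consecutive output pixels come from
// consecutive source rows, i.e. a source column becomes a destination row.
// Assumes 4-byte-aligned ARGB pixels.
void ScaleARGBRowDownEven_Model(const uint8_t* src_argb, int /*src_stride*/,
                                int src_stepx, uint8_t* dst_argb,
                                int dst_width) {
  const uint32_t* src = reinterpret_cast<const uint32_t*>(src_argb);
  uint32_t* dst = reinterpret_cast<uint32_t*>(dst_argb);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}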
+
+void ARGBRotate90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ if (width * 4 > kMaxStride) {
+ return;
+ }
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
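
Annotation: as with the other public entry points, ARGBRotate treats a negative height as a request to read the source bottom-up, so a vertical flip can be expressed without a separate call. A hedged usage sketch (the helper name and packed stride are assumptions of this example):

#include "libyuv/rotate.h"       // RotationMode, kRotate0
#include "libyuv/rotate_argb.h"  // ARGBRotate
#include <cstdint>
#include <vector>

// Vertically flip an ARGB image by passing a negative height with kRotate0,
// which copies the inverted source. Assumes stride == width * 4.
bool FlipArgbVertically(const uint8_t* src, int width, int height,
                        std::vector<uint8_t>* dst) {
  dst->resize(static_cast<size_t>(width) * height * 4);
  return libyuv::ARGBRotate(src, width * 4, dst->data(), width * 4,
                            width, -height, libyuv::kRotate0) == 0;
}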
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
new file mode 100644
index 00000000..49b30032
--- /dev/null
+++ b/files/source/rotate_neon.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+static const uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld1.8 {d0}, [r9], %1 \n"
+ "vld1.8 {d1}, [r9], %1 \n"
+ "vld1.8 {d2}, [r9], %1 \n"
+ "vld1.8 {d3}, [r9], %1 \n"
+ "vld1.8 {d4}, [r9], %1 \n"
+ "vld1.8 {d5}, [r9], %1 \n"
+ "vld1.8 {d6}, [r9], %1 \n"
+ "vld1.8 {d7}, [r9] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d1}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d3}, [r9], %3 \n"
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d5}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d7}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9] \n"
+
+ "add %0, #8 \n" // src += 8
+ "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %4, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ "cmp %4, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.32 {d0[0]}, [r9], %1 \n"
+ "vld1.32 {d0[1]}, [r9], %1 \n"
+ "vld1.32 {d1[0]}, [r9], %1 \n"
+ "vld1.32 {d1[1]}, [r9], %1 \n"
+ "vld1.32 {d2[0]}, [r9], %1 \n"
+ "vld1.32 {d2[1]}, [r9], %1 \n"
+ "vld1.32 {d3[0]}, [r9], %1 \n"
+ "vld1.32 {d3[1]}, [r9] \n"
+
+ "mov r9, %2 \n"
+
+ "vld1.8 {q3}, [%5] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO: rework shuffle above to write
+ // out with 4 instead of 8 writes
+ "vst1.32 {d4[0]}, [r9], %3 \n"
+ "vst1.32 {d4[1]}, [r9], %3 \n"
+ "vst1.32 {d5[0]}, [r9], %3 \n"
+ "vst1.32 {d5[1]}, [r9] \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d0[0]}, [r9], %3 \n"
+ "vst1.32 {d0[1]}, [r9], %3 \n"
+ "vst1.32 {d1[0]}, [r9], %3 \n"
+ "vst1.32 {d1[1]}, [r9] \n"
+
+ "add %0, #4 \n" // src += 4
+ "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %4, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld1.16 {d0[0]}, [r9], %1 \n"
+ "vld1.16 {d1[0]}, [r9], %1 \n"
+ "vld1.16 {d0[1]}, [r9], %1 \n"
+ "vld1.16 {d1[1]}, [r9], %1 \n"
+ "vld1.16 {d0[2]}, [r9], %1 \n"
+ "vld1.16 {d1[2]}, [r9], %1 \n"
+ "vld1.16 {d0[3]}, [r9], %1 \n"
+ "vld1.16 {d1[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d1}, [r9] \n"
+
+ "add %0, #2 \n" // src += 2
+ "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %4, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%0], %1 \n"
+ "vld1.8 {d0[1]}, [%0], %1 \n"
+ "vld1.8 {d0[2]}, [%0], %1 \n"
+ "vld1.8 {d0[3]}, [%0], %1 \n"
+ "vld1.8 {d0[4]}, [%0], %1 \n"
+ "vld1.8 {d0[5]}, [%0], %1 \n"
+ "vld1.8 {d0[6]}, [%0], %1 \n"
+ "vld1.8 {d0[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_stride), // %3
+ "+r"(width) // %4
+ : "r"(&kVTbl4x4Transpose) // %5
+ : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+ );
+}
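
Annotation: the vtrn.8 / vtrn.16 / vtrn.32 sequence above is a butterfly transpose of the 8x8 tile; the exact register pairing and the vrev16 pass differ in detail, but the net effect is the same as swapping progressively larger off-diagonal sub-blocks. A scalar model of that idea, included only to make the data movement easier to follow:

#include <cstdint>
#include <utility>

// Butterfly (Eklundh-style) transpose of an 8x8 byte tile: swap the
// off-diagonal s x s sub-blocks for s = 1, 2, 4. This is the scalar
// equivalent of the vtrn.8 / vtrn.16 / vtrn.32 stages in TransposeWx8_NEON.
void Transpose8x8_Butterfly(uint8_t m[8][8]) {
  for (int s = 1; s < 8; s *= 2) {
    for (int i = 0; i < 8; i += 2 * s) {
      for (int j = 0; j < 8; j += 2 * s) {
        for (int di = 0; di < s; ++di) {
          for (int dj = 0; dj < s; ++dj) {
            std::swap(m[i + di][j + s + dj], m[i + s + di][j + dj]);
          }
        }
      }
    }
  }
}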
+
+static const uvec8 kVTbl4x4TransposeDi =
+ { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %6, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld2.8 {d0, d1}, [r9], %1 \n"
+ "vld2.8 {d2, d3}, [r9], %1 \n"
+ "vld2.8 {d4, d5}, [r9], %1 \n"
+ "vld2.8 {d6, d7}, [r9], %1 \n"
+ "vld2.8 {d16, d17}, [r9], %1 \n"
+ "vld2.8 {d18, d19}, [r9], %1 \n"
+ "vld2.8 {d20, d21}, [r9], %1 \n"
+ "vld2.8 {d22, d23}, [r9] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d18}, [r9], %3 \n"
+ "vst1.8 {d16}, [r9], %3 \n"
+ "vst1.8 {d22}, [r9], %3 \n"
+ "vst1.8 {d20}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.8 {d3}, [r9], %5 \n"
+ "vst1.8 {d1}, [r9], %5 \n"
+ "vst1.8 {d7}, [r9], %5 \n"
+ "vst1.8 {d5}, [r9], %5 \n"
+ "vst1.8 {d19}, [r9], %5 \n"
+ "vst1.8 {d17}, [r9], %5 \n"
+ "vst1.8 {d23}, [r9], %5 \n"
+ "vst1.8 {d21}, [r9] \n"
+
+ "add %0, #8*2 \n" // src += 8*2
+ "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %6, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %6, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ "cmp %6, #4 \n"
+ "blt 2f \n"
+
+    // TODO(frkoenig): clean this up
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.64 {d0}, [r9], %1 \n"
+ "vld1.64 {d1}, [r9], %1 \n"
+ "vld1.64 {d2}, [r9], %1 \n"
+ "vld1.64 {d3}, [r9], %1 \n"
+ "vld1.64 {d4}, [r9], %1 \n"
+ "vld1.64 {d5}, [r9], %1 \n"
+ "vld1.64 {d6}, [r9], %1 \n"
+ "vld1.64 {d7}, [r9] \n"
+
+ "vld1.8 {q15}, [%7] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.32 {d16[0]}, [r9], %3 \n"
+ "vst1.32 {d16[1]}, [r9], %3 \n"
+ "vst1.32 {d17[0]}, [r9], %3 \n"
+ "vst1.32 {d17[1]}, [r9], %3 \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d20[0]}, [r9], %3 \n"
+ "vst1.32 {d20[1]}, [r9], %3 \n"
+ "vst1.32 {d21[0]}, [r9], %3 \n"
+ "vst1.32 {d21[1]}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.32 {d18[0]}, [r9], %5 \n"
+ "vst1.32 {d18[1]}, [r9], %5 \n"
+ "vst1.32 {d19[0]}, [r9], %5 \n"
+ "vst1.32 {d19[1]}, [r9], %5 \n"
+
+ "add r9, %4, #4 \n"
+ "vst1.32 {d22[0]}, [r9], %5 \n"
+ "vst1.32 {d22[1]}, [r9], %5 \n"
+ "vst1.32 {d23[0]}, [r9], %5 \n"
+ "vst1.32 {d23[1]}, [r9] \n"
+
+ "add %0, #4*2 \n" // src += 4 * 2
+ "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %6, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
+ "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
+ "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
+ "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
+ "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
+ "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
+ "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
+ "vld2.16 {d1[3], d3[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d2}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.64 {d1}, [r9], %5 \n"
+ "vst1.64 {d3}, [r9] \n"
+
+ "add %0, #2*2 \n" // src += 2 * 2
+ "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %6, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
+ "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
+ "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
+ "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
+ "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
+ "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
+ "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
+ "vld2.8 {d0[7], d1[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+ "vst1.64 {d1}, [%4] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_stride_a), // %3
+ "+r"(dst_b), // %4
+ "+r"(dst_stride_b), // %5
+ "+r"(width) // %6
+ : "r"(&kVTbl4x4TransposeDi) // %7
+ : "memory", "cc", "r9",
+ "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
deleted file mode 100644
index 75ea957a..00000000
--- a/files/source/rotate_neon.s
+++ /dev/null
@@ -1,563 +0,0 @@
- .global RestoreRegisters_NEON
- .global ReverseLine_NEON
- .global ReverseLineUV_NEON
- .global SaveRegisters_NEON
- .global TransposeWx8_NEON
- .global TransposeUVWx8_NEON
- .type RestoreRegisters_NEON, function
- .type ReverseLine_NEON, function
- .type ReverseLineUV_NEON, function
- .type SaveRegisters_NEON, function
- .type TransposeWx8_NEON, function
- .type TransposeUVWx8_NEON, function
-
-@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
-@ r0 const uint8* src
-@ r1 uint8* dst
-@ r2 width
-ReverseLine_NEON:
-
- @ compute where to start writing destination
- add r1, r2 @ dst + width
-
- @ work on segments that are multiples of 16
- lsrs r3, r2, #4
-
- @ the output is written in two block. 8 bytes followed
- @ by another 8. reading is done sequentially, from left to
- @ right. writing is done from right to left in block sizes
- @ r1, the destination pointer is incremented after writing
- @ the first of the two blocks. need to subtract that 8 off
- @ along with 16 to get the next location.
- mov r3, #-24
-
- beq Lline_residuals
-
- @ back of destination by the size of the register that is
- @ going to be reversed
- sub r1, #16
-
- @ the loop needs to run on blocks of 16. what will be left
- @ over is either a negative number, the residuals that need
- @ to be done, or 0. if this isn't subtracted off here the
- @ loop will run one extra time.
- sub r2, #16
-
-Lsegments_of_16:
- vld1.8 {q0}, [r0]! @ src += 16
-
- @ reverse the bytes in the 64 bit segments. unable to reverse
- @ the bytes in the entire 128 bits in one go.
- vrev64.8 q0, q0
-
- @ because of the inability to reverse the entire 128 bits
- @ reverse the writing out of the two 64 bit segments.
- vst1.8 {d1}, [r1]!
- vst1.8 {d0}, [r1], r3 @ dst -= 16
-
- subs r2, #16
- bge Lsegments_of_16
-
- @ add 16 back to the counter. if the result is 0 there is no
- @ residuals so return
- adds r2, #16
- bxeq lr
-
- add r1, #16
-
-Lline_residuals:
-
- mov r3, #-3
-
- sub r1, #2
- subs r2, #2
- @ check for 16*n+1 scenarios where segments_of_2 should not
- @ be run, but there is something left over.
- blt Lsegment_of_1
-
-@ do this in neon registers as per
-@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-Lsegments_of_2:
- vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
-
- vst1.8 {d1[0]}, [r1]!
- vst1.8 {d0[0]}, [r1], r3 @ dst -= 2
-
- subs r2, #2
- bge Lsegments_of_2
-
- adds r2, #2
- bxeq lr
-
-Lsegment_of_1:
- add r1, #1
- vld1.8 {d0[0]}, [r0]
- vst1.8 {d0[0]}, [r1]
-
- bx lr
-
-@ void TransposeWx8_NEON (const uint8* src, int src_stride,
-@ uint8* dst, int dst_stride,
-@ int w)
-@ r0 const uint8* src
-@ r1 int src_stride
-@ r2 uint8* dst
-@ r3 int dst_stride
-@ stack int w
-TransposeWx8_NEON:
- push {r4,r8,r9,lr}
-
- ldr r8, [sp, #16] @ width
-
- @ loops are on blocks of 8. loop will stop when
- @ counter gets to or below 0. starting the counter
- @ at w-8 allow for this
- sub r8, #8
-
-@ handle 8x8 blocks. this should be the majority of the plane
-Lloop_8x8:
- mov r9, r0
-
- vld1.8 {d0}, [r9], r1
- vld1.8 {d1}, [r9], r1
- vld1.8 {d2}, [r9], r1
- vld1.8 {d3}, [r9], r1
- vld1.8 {d4}, [r9], r1
- vld1.8 {d5}, [r9], r1
- vld1.8 {d6}, [r9], r1
- vld1.8 {d7}, [r9]
-
- vtrn.8 d1, d0
- vtrn.8 d3, d2
- vtrn.8 d5, d4
- vtrn.8 d7, d6
-
- vtrn.16 d1, d3
- vtrn.16 d0, d2
- vtrn.16 d5, d7
- vtrn.16 d4, d6
-
- vtrn.32 d1, d5
- vtrn.32 d0, d4
- vtrn.32 d3, d7
- vtrn.32 d2, d6
-
- vrev16.8 q0, q0
- vrev16.8 q1, q1
- vrev16.8 q2, q2
- vrev16.8 q3, q3
-
- mov r9, r2
-
- vst1.8 {d1}, [r9], r3
- vst1.8 {d0}, [r9], r3
- vst1.8 {d3}, [r9], r3
- vst1.8 {d2}, [r9], r3
- vst1.8 {d5}, [r9], r3
- vst1.8 {d4}, [r9], r3
- vst1.8 {d7}, [r9], r3
- vst1.8 {d6}, [r9]
-
- add r0, #8 @ src += 8
- add r2, r3, lsl #3 @ dst += 8 * dst_stride
- subs r8, #8 @ w -= 8
- bge Lloop_8x8
-
- @ add 8 back to counter. if the result is 0 there are
- @ no residuals.
- adds r8, #8
- beq Ldone
-
- @ some residual, so between 1 and 7 lines left to transpose
- cmp r8, #2
- blt Lblock_1x8
-
- cmp r8, #4
- blt Lblock_2x8
-
-Lblock_4x8:
- mov r9, r0
- vld1.32 {d0[0]}, [r9], r1
- vld1.32 {d0[1]}, [r9], r1
- vld1.32 {d1[0]}, [r9], r1
- vld1.32 {d1[1]}, [r9], r1
- vld1.32 {d2[0]}, [r9], r1
- vld1.32 {d2[1]}, [r9], r1
- vld1.32 {d3[0]}, [r9], r1
- vld1.32 {d3[1]}, [r9]
-
- mov r9, r2
-
- adr r12, vtbl_4x4_transpose
- vld1.8 {q3}, [r12]
-
- vtbl.8 d4, {d0, d1}, d6
- vtbl.8 d5, {d0, d1}, d7
- vtbl.8 d0, {d2, d3}, d6
- vtbl.8 d1, {d2, d3}, d7
-
- @ TODO: rework shuffle above to write
- @ out with 4 instead of 8 writes
- vst1.32 {d4[0]}, [r9], r3
- vst1.32 {d4[1]}, [r9], r3
- vst1.32 {d5[0]}, [r9], r3
- vst1.32 {d5[1]}, [r9]
-
- add r9, r2, #4
- vst1.32 {d0[0]}, [r9], r3
- vst1.32 {d0[1]}, [r9], r3
- vst1.32 {d1[0]}, [r9], r3
- vst1.32 {d1[1]}, [r9]
-
- add r0, #4 @ src += 4
- add r2, r3, lsl #2 @ dst += 4 * dst_stride
- subs r8, #4 @ w -= 4
- beq Ldone
-
- @ some residual, check to see if it includes a 2x8 block,
- @ or less
- cmp r8, #2
- blt Lblock_1x8
-
-Lblock_2x8:
- mov r9, r0
- vld1.16 {d0[0]}, [r9], r1
- vld1.16 {d1[0]}, [r9], r1
- vld1.16 {d0[1]}, [r9], r1
- vld1.16 {d1[1]}, [r9], r1
- vld1.16 {d0[2]}, [r9], r1
- vld1.16 {d1[2]}, [r9], r1
- vld1.16 {d0[3]}, [r9], r1
- vld1.16 {d1[3]}, [r9]
-
- vtrn.8 d0, d1
-
- mov r9, r2
-
- vst1.64 {d0}, [r9], r3
- vst1.64 {d1}, [r9]
-
- add r0, #2 @ src += 2
- add r2, r3, lsl #1 @ dst += 2 * dst_stride
- subs r8, #2 @ w -= 2
- beq Ldone
-
-Lblock_1x8:
- vld1.8 {d0[0]}, [r0], r1
- vld1.8 {d0[1]}, [r0], r1
- vld1.8 {d0[2]}, [r0], r1
- vld1.8 {d0[3]}, [r0], r1
- vld1.8 {d0[4]}, [r0], r1
- vld1.8 {d0[5]}, [r0], r1
- vld1.8 {d0[6]}, [r0], r1
- vld1.8 {d0[7]}, [r0]
-
- vst1.64 {d0}, [r2]
-
-Ldone:
-
- pop {r4,r8,r9,pc}
-
-vtbl_4x4_transpose:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
-@ void SaveRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-SaveRegisters_NEON:
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
-@ void RestoreRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-RestoreRegisters_NEON:
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
-@ void ReverseLineUV_NEON (const uint8* src,
-@ uint8* dst_a,
-@ uint8* dst_b,
-@ int width)
-@ r0 const uint8* src
-@ r1 uint8* dst_a
-@ r2 uint8* dst_b
-@ r3 width
-ReverseLineUV_NEON:
-
- @ compute where to start writing destination
- add r1, r1, r3 @ dst_a + width
- add r2, r2, r3 @ dst_b + width
-
- @ work on input segments that are multiples of 16, but
- @ width that has been passed is output segments, half
- @ the size of input.
- lsrs r12, r3, #3
-
- beq Lline_residuals_di
-
- @ the output is written in to two blocks.
- mov r12, #-8
-
- @ back of destination by the size of the register that is
- @ going to be reversed
- sub r1, r1, #8
- sub r2, r2, #8
-
- @ the loop needs to run on blocks of 8. what will be left
- @ over is either a negative number, the residuals that need
- @ to be done, or 0. if this isn't subtracted off here the
- @ loop will run one extra time.
- sub r3, r3, #8
-
-Lsegments_of_8_di:
- vld2.8 {d0, d1}, [r0]! @ src += 16
-
- @ reverse the bytes in the 64 bit segments
- vrev64.8 q0, q0
-
- vst1.8 {d0}, [r1], r12 @ dst_a -= 8
- vst1.8 {d1}, [r2], r12 @ dst_b -= 8
-
- subs r3, r3, #8
- bge Lsegments_of_8_di
-
- @ add 8 back to the counter. if the result is 0 there is no
- @ residuals so return
- adds r3, r3, #8
- bxeq lr
-
- add r1, r1, #8
- add r2, r2, #8
-
-Lline_residuals_di:
-
- mov r12, #-1
-
- sub r1, r1, #1
- sub r2, r2, #1
-
-@ do this in neon registers as per
-@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-Lsegments_of_1:
- vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
-
- vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1
- vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1
-
- subs r3, r3, #1
- bgt Lsegments_of_1
-
- bx lr
-
-@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
-@ uint8* dst_a, int dst_stride_a,
-@ uint8* dst_b, int dst_stride_b,
-@ int width)
-@ r0 const uint8* src
-@ r1 int src_stride
-@ r2 uint8* dst_a
-@ r3 int dst_stride_a
-@ stack uint8* dst_b
-@ stack int dst_stride_b
-@ stack int width
-TransposeUVWx8_NEON:
- push {r4-r9,lr}
-
- ldr r4, [sp, #28] @ dst_b
- ldr r5, [sp, #32] @ dst_stride_b
- ldr r8, [sp, #36] @ width
- @ loops are on blocks of 8. loop will stop when
- @ counter gets to or below 0. starting the counter
- @ at w-8 allow for this
- sub r8, #8
-
-@ handle 8x8 blocks. this should be the majority of the plane
-Lloop_8x8_di:
- mov r9, r0
-
- vld2.8 {d0, d1}, [r9], r1
- vld2.8 {d2, d3}, [r9], r1
- vld2.8 {d4, d5}, [r9], r1
- vld2.8 {d6, d7}, [r9], r1
- vld2.8 {d8, d9}, [r9], r1
- vld2.8 {d10, d11}, [r9], r1
- vld2.8 {d12, d13}, [r9], r1
- vld2.8 {d14, d15}, [r9]
-
- vtrn.8 q1, q0
- vtrn.8 q3, q2
- vtrn.8 q5, q4
- vtrn.8 q7, q6
-
- vtrn.16 q1, q3
- vtrn.16 q0, q2
- vtrn.16 q5, q7
- vtrn.16 q4, q6
-
- vtrn.32 q1, q5
- vtrn.32 q0, q4
- vtrn.32 q3, q7
- vtrn.32 q2, q6
-
- vrev16.8 q0, q0
- vrev16.8 q1, q1
- vrev16.8 q2, q2
- vrev16.8 q3, q3
- vrev16.8 q4, q4
- vrev16.8 q5, q5
- vrev16.8 q6, q6
- vrev16.8 q7, q7
-
- mov r9, r2
-
- vst1.8 {d2}, [r9], r3
- vst1.8 {d0}, [r9], r3
- vst1.8 {d6}, [r9], r3
- vst1.8 {d4}, [r9], r3
- vst1.8 {d10}, [r9], r3
- vst1.8 {d8}, [r9], r3
- vst1.8 {d14}, [r9], r3
- vst1.8 {d12}, [r9]
-
- mov r9, r4
-
- vst1.8 {d3}, [r9], r5
- vst1.8 {d1}, [r9], r5
- vst1.8 {d7}, [r9], r5
- vst1.8 {d5}, [r9], r5
- vst1.8 {d11}, [r9], r5
- vst1.8 {d9}, [r9], r5
- vst1.8 {d15}, [r9], r5
- vst1.8 {d13}, [r9]
-
- add r0, #8*2 @ src += 8*2
- add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a
- add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b
- subs r8, #8 @ w -= 8
- bge Lloop_8x8_di
-
- @ add 8 back to counter. if the result is 0 there are
- @ no residuals.
- adds r8, #8
- beq Ldone_di
-
- @ some residual, so between 1 and 7 lines left to transpose
- cmp r8, #2
- blt Lblock_1x8_di
-
- cmp r8, #4
- blt Lblock_2x8_di
-
-@ TODO(frkoenig) : clean this up
-Lblock_4x8_di:
- mov r9, r0
- vld1.64 {d0}, [r9], r1
- vld1.64 {d1}, [r9], r1
- vld1.64 {d2}, [r9], r1
- vld1.64 {d3}, [r9], r1
- vld1.64 {d4}, [r9], r1
- vld1.64 {d5}, [r9], r1
- vld1.64 {d6}, [r9], r1
- vld1.64 {d7}, [r9]
-
- adr r12, vtbl_4x4_transpose_di
- vld1.8 {q7}, [r12]
-
- vtrn.8 q0, q1
- vtrn.8 q2, q3
-
- vtbl.8 d8, {d0, d1}, d14
- vtbl.8 d9, {d0, d1}, d15
- vtbl.8 d10, {d2, d3}, d14
- vtbl.8 d11, {d2, d3}, d15
- vtbl.8 d12, {d4, d5}, d14
- vtbl.8 d13, {d4, d5}, d15
- vtbl.8 d0, {d6, d7}, d14
- vtbl.8 d1, {d6, d7}, d15
-
- mov r9, r2
-
- vst1.32 {d8[0]}, [r9], r3
- vst1.32 {d8[1]}, [r9], r3
- vst1.32 {d9[0]}, [r9], r3
- vst1.32 {d9[1]}, [r9], r3
-
- add r9, r2, #4
- vst1.32 {d12[0]}, [r9], r3
- vst1.32 {d12[1]}, [r9], r3
- vst1.32 {d13[0]}, [r9], r3
- vst1.32 {d13[1]}, [r9]
-
- mov r9, r4
-
- vst1.32 {d10[0]}, [r9], r5
- vst1.32 {d10[1]}, [r9], r5
- vst1.32 {d11[0]}, [r9], r5
- vst1.32 {d11[1]}, [r9], r5
-
- add r9, r4, #4
- vst1.32 {d0[0]}, [r9], r5
- vst1.32 {d0[1]}, [r9], r5
- vst1.32 {d1[0]}, [r9], r5
- vst1.32 {d1[1]}, [r9]
-
- add r0, #4*2 @ src += 4 * 2
- add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
- add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b
- subs r8, #4 @ w -= 4
- beq Ldone_di
-
- @ some residual, check to see if it includes a 2x8 block,
- @ or less
- cmp r8, #2
- blt Lblock_1x8_di
-
-Lblock_2x8_di:
- mov r9, r0
- vld2.16 {d0[0], d2[0]}, [r9], r1
- vld2.16 {d1[0], d3[0]}, [r9], r1
- vld2.16 {d0[1], d2[1]}, [r9], r1
- vld2.16 {d1[1], d3[1]}, [r9], r1
- vld2.16 {d0[2], d2[2]}, [r9], r1
- vld2.16 {d1[2], d3[2]}, [r9], r1
- vld2.16 {d0[3], d2[3]}, [r9], r1
- vld2.16 {d1[3], d3[3]}, [r9]
-
- vtrn.8 d0, d1
- vtrn.8 d2, d3
-
- mov r9, r2
-
- vst1.64 {d0}, [r9], r3
- vst1.64 {d2}, [r9]
-
- mov r9, r4
-
- vst1.64 {d1}, [r9], r5
- vst1.64 {d3}, [r9]
-
- add r0, #2*2 @ src += 2 * 2
- add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a
- add r4, r5, lsl #1 @ dst_a += 2 * dst_stride_a
- subs r8, #2 @ w -= 2
- beq Ldone_di
-
-Lblock_1x8_di:
- vld2.8 {d0[0], d1[0]}, [r0], r1
- vld2.8 {d0[1], d1[1]}, [r0], r1
- vld2.8 {d0[2], d1[2]}, [r0], r1
- vld2.8 {d0[3], d1[3]}, [r0], r1
- vld2.8 {d0[4], d1[4]}, [r0], r1
- vld2.8 {d0[5], d1[5]}, [r0], r1
- vld2.8 {d0[6], d1[6]}, [r0], r1
- vld2.8 {d0[7], d1[7]}, [r0]
-
- vst1.64 {d0}, [r2]
- vst1.64 {d1}, [r4]
-
-Ldone_di:
- pop {r4-r9, pc}
-
-vtbl_4x4_transpose_di:
- .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
diff --git a/files/source/rotate_priv.h b/files/source/rotate_priv.h
deleted file mode 100644
index b4df1494..00000000
--- a/files/source/rotate_priv.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef SOURCE_ROTATE_PRIV_H_
-#define SOURCE_ROTATE_PRIV_H_
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-// Rotate planes by 90, 180, 270
-void
-RotatePlane90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotatePlane180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotatePlane270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotateUV90(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them.
-void
-RotateUV180(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-void
-RotateUV270(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-// The 90 and 270 functions are based on transposes.
-// Doing a transpose with reversing the read/write
-// order will result in a rotation by +- 90 degrees.
-void
-TransposePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-TransposeUV(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-} // namespace libyuv
-
-#endif // SOURCE_ROTATE_PRIV_H_
diff --git a/files/source/row.h b/files/source/row.h
deleted file mode 100644
index 85343c56..00000000
--- a/files/source/row.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_ROW_H_
-#define LIBYUV_SOURCE_ROW_H_
-
-#include "libyuv/basic_types.h"
-
-// The following are available on all x86 platforms
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BG24TOARGBROW_SSSE3
-#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RGB24TOYROW_SSSE3
-#define HAS_RAWTOYROW_SSSE3
-#define HAS_RGB24TOUVROW_SSSE3
-#define HAS_RAWTOUVROW_SSSE3
-#endif
-
-// The following are available only on Windows
-#if defined(WIN32) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_BGRATOYROW_SSSE3
-#define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_ABGRTOUVROW_SSSE3
-#endif
-
-extern "C" {
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-#endif
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#endif
-
-#ifdef OSX
-extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-#else
-extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
-#endif
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
-
-// Method to force C version.
-//#define USE_MMX 0
-//#define USE_SSE2 0
-
-#if !defined(USE_MMX)
-// Windows, Mac and Linux use MMX
-#if defined(__i386__) || defined(_MSC_VER)
-#define USE_MMX 1
-#else
-#define USE_MMX 0
-#endif
-#endif
-
-#if !defined(USE_SSE2)
-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
-#define USE_SSE2 1
-#else
-#define USE_SSE2 0
-#endif
-#endif
-
-// x64 uses MMX2 (SSE) so emms is not required.
-// Warning C4799: function has no EMMS instruction.
-// EMMS() is slow and should be called by the calling function once per image.
-#if USE_MMX && !defined(ARCH_CPU_X86_64)
-#if defined(_MSC_VER)
-#define EMMS() __asm emms
-#pragma warning(disable: 4799)
-#else
-#define EMMS() asm("emms")
-#endif
-#else
-#define EMMS()
-#endif
-
-
-} // extern "C"
-
-#endif // LIBYUV_SOURCE_ROW_H_
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
new file mode 100644
index 00000000..c5f3ce05
--- /dev/null
+++ b/files/source/row_common.cc
@@ -0,0 +1,1246 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 a = src_bgra[0];
+ uint8 r = src_bgra[1];
+ uint8 g = src_bgra[2];
+ uint8 b = src_bgra[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_bgra += 4;
+ }
+}
+
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 r = src_abgr[0];
+ uint8 g = src_abgr[1];
+ uint8 b = src_abgr[2];
+ uint8 a = src_abgr[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_abgr += 4;
+ }
+}
+
+void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 a = src_abgr[0];
+ uint8 b = src_abgr[1];
+ uint8 g = src_abgr[2];
+ uint8 r = src_abgr[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_abgr += 4;
+ }
+}
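
Annotation: the "to support in-place conversion" note above is the key property of these swizzle rows: every byte of a pixel is read into a local before any destination byte is written, so the same pointer can be passed for src and dst. A minimal self-contained illustration of that read-all-then-write pattern (not library code):

#include <cassert>
#include <cstdint>

// One-pixel version of the pattern used by the row functions above: read the
// whole pixel first, then write, so src and dst may be the same memory.
void SwizzlePixelInPlace(uint8_t* px) {
  uint8_t a = px[0], b = px[1], g = px[2], r = px[3];  // read everything first
  px[0] = b; px[1] = g; px[2] = r; px[3] = a;          // then overwrite
}

int main() {
  uint8_t px[4] = {10, 20, 30, 40};
  SwizzlePixelInPlace(px);
  assert(px[0] == 20 && px[1] == 30 && px[2] == 40 && px[3] == 10);
  return 0;
}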
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb24 += 3;
+ }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x1f;
+ uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
+ uint8 r = src_rgb[1] >> 3;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 2) | (g >> 4);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x1f;
+ uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
+ uint8 r = (src_rgb[1] & 0x7c) >> 2;
+ uint8 a = src_rgb[1] >> 7;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 3) | (g >> 2);
+ dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;  // a is 0 or 1; negation maps it to 0 or 255.
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x0f;
+ uint8 g = src_rgb[0] >> 4;
+ uint8 r = src_rgb[1] & 0x0f;
+ uint8 a = src_rgb[1] >> 4;
+ dst_argb[0] = (b << 4) | b;
+ dst_argb[1] = (g << 4) | g;
+ dst_argb[2] = (r << 4) | r;
+ dst_argb[3] = (a << 4) | a;
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
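
Annotation: the (x << 3) | (x >> 2) and (x << 4) | x expressions above expand 5- and 4-bit channels to 8 bits by bit replication, which hits 0 and 255 exactly at the endpoints and spreads the values evenly in between (green in RGB565 uses the 6-bit form (g << 2) | (g >> 4)). A quick check of the pattern, with values chosen only for illustration:

#include <cassert>
#include <cstdint>

// Bit replication used by the RGB565/ARGB1555/ARGB4444 expanders above.
inline uint8_t Expand5(uint8_t v) { return (v << 3) | (v >> 2); }  // 5 -> 8 bits
inline uint8_t Expand4(uint8_t v) { return (v << 4) | v; }         // 4 -> 8 bits

int main() {
  assert(Expand5(0x00) == 0x00 && Expand5(0x1f) == 0xff);  // endpoints map exactly
  assert(Expand5(0x10) == 0x84);                           // 16/31 ~ 132/255
  assert(Expand4(0x0f) == 0xff && Expand4(0x08) == 0x88);
  return 0;
}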
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ uint8 a = src_argb[3];
+ dst_rgb[0] = a;
+ dst_rgb[1] = b;
+ dst_rgb[2] = g;
+ dst_rgb[3] = r;
+ dst_rgb += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = b;
+ dst_rgb[1] = g;
+ dst_rgb[2] = r;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = r;
+ dst_rgb[1] = g;
+ dst_rgb[2] = b;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+// TODO(fbarchard): support big endian CPU
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 2;
+ uint8 r1 = src_argb[6] >> 3;
+ *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
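
Annotation: the loop above packs two 16-bit RGB565 pixels into a single 32-bit store (with an odd-width tail), which is what the "support big endian CPU" TODO refers to: the low 16 bits must land on the first pixel in memory. An endian-neutral formulation would store each pixel byte by byte; sketched below for comparison only, this is not the library code:

#include <cstdint>

// Endian-neutral variant of the ARGBToRGB565Row_C inner loop: one explicit
// little-endian 16-bit store per pixel instead of one 32-bit store per pair.
void ArgbToRgb565Row_Portable(const uint8_t* src_argb, uint8_t* dst_rgb,
                              int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t b = src_argb[0] >> 3;
    uint16_t g = src_argb[1] >> 2;
    uint16_t r = src_argb[2] >> 3;
    uint16_t pixel = static_cast<uint16_t>(b | (g << 5) | (r << 11));
    dst_rgb[0] = static_cast<uint8_t>(pixel & 0xff);  // low byte first
    dst_rgb[1] = static_cast<uint8_t>(pixel >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}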
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 3;
+ uint8 r1 = src_argb[6] >> 3;
+ uint8 a1 = src_argb[7] >> 7;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ uint8 b1 = src_argb[4] >> 4;
+ uint8 g1 = src_argb[5] >> 4;
+ uint8 r1 = src_argb[6] >> 4;
+ uint8 a1 = src_argb[7] >> 4;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
+}
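+
+// The helpers above implement the BT.601 studio-swing conversion in 8 bit
+// fixed point, e.g. RGBToY(255, 255, 255) =
+// (((66 + 129 + 25) * 255 + 128) >> 8) + 16 = 235 and RGBToY(0, 0, 0) = 16,
+// so Y lands in the [16, 235] video range.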
+
+#define MAKEROWY(NAME, R, G, B) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ for (int x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += 4; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ for (int x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
+ src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
+ src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
+ src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += 8; \
+ src_rgb1 += 8; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+MAKEROWY(ARGB, 2, 1, 0)
+MAKEROWY(BGRA, 1, 2, 3)
+MAKEROWY(ABGR, 0, 1, 2)
+MAKEROWY(RGBA, 3, 2, 1)
+
+// http://en.wikipedia.org/wiki/Grayscale.
+// 0.11 * B + 0.59 * G + 0.30 * R
+// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
+static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
+ return (( 76 * r + 152 * g + 28 * b) >> 8);
+}
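+
+// The coefficients sum to 256, so RGBToGray(255, 255, 255) = (256 * 255) >> 8
+// = 255, and the weights correspond to roughly 0.30 R, 0.59 G, 0.11 B.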
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = src_argb[3];
+ dst_argb += 4;
+ src_argb += 4;
+ }
+}
+
+// Convert a row of image to Sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int sb = (b * 17 + g * 68 + r * 35) >> 7;
+ int sg = (b * 22 + g * 88 + r * 45) >> 7;
+ int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not overflow; a is preserved from the original.
+ if (sg > 255) {
+ sg = 255;
+ }
+ if (sr > 255) {
+ sr = 255;
+ }
+ dst_argb[0] = sb;
+ dst_argb[1] = sg;
+ dst_argb[2] = sr;
+ dst_argb += 4;
+ }
+}
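+
+// For white input the sepia weights give sb = (120 * 255) >> 7 = 239,
+// sg = (155 * 255) >> 7 = 308 and sr = (172 * 255) >> 7 = 342, so only green
+// and red can exceed 255; the blue weights sum to 120 < 128, which is why sb
+// needs no clamp.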
+
+// Apply color matrix to a row of image. Matrix is signed.
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+ if (sb < 0) {
+ sb = 0;
+ }
+ if (sb > 255) {
+ sb = 255;
+ }
+ if (sg < 0) {
+ sg = 0;
+ }
+ if (sg > 255) {
+ sg = 255;
+ }
+ if (sr < 0) {
+ sr = 0;
+ }
+ if (sr > 255) {
+ sr = 255;
+ }
+ dst_argb[0] = sb;
+ dst_argb[1] = sg;
+ dst_argb[2] = sr;
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb[3] = table_argb[a * 4 + 3];
+ dst_argb += 4;
+ }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+ dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+ dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb += 4;
+ }
+}
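+
+// For example, with interval_size = 8 and a scale of 65536 / 8 = 8192 (the
+// 16.16 reciprocal of the interval), a channel of 130 maps to
+// (130 * 8192) >> 16 = 16, then 16 * 8 + interval_offset, snapping every
+// value in [128, 135] to the same level.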
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+ // Copy a Y to RGB.
+ for (int x = 0; x < width; ++x) {
+ uint8 y = src_y[0];
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ ++src_y;
+ }
+}
+
+// C reference code that mimics the YUV assembly.
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129; clamped to 127 to fit int8 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
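+
+// For reference: with Y = 235 and U = V = 128, YuvPixel below computes
+// y1 = (235 - 16) * 74 = 16206, and in every channel the U/V terms cancel
+// against the bias, leaving Clip(16206 >> 6) = 253; studio white therefore
+// decodes to about (253, 253, 253), and Y = 16 with U = V = 128 gives black.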
+
+static __inline uint32 Clip(int32 val) {
+ if (val < 0) {
+ return static_cast<uint32>(0);
+ } else if (val > 255) {
+ return static_cast<uint32>(255);
+ }
+ return static_cast<uint32>(val);
+}
+
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
+ int ashift, int rshift, int gshift, int bshift) {
+ int32 y1 = (static_cast<int32>(y) - 16) * YG;
+ uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+ uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+ uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+ *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |
+ (g << gshift) |
+ (r << rshift) |
+ (255u << ashift);
+}
+
+static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ int32 y1 = (static_cast<int32>(y) - 16) * YG;
+ *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+ *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+ *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+}
+
+void I444ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width; ++x) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
+ y_buf += 1;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// Also used for 420
+void I422ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+
+void I422ToRAWRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+ rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ }
+}
+
+void I411ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 3; x += 4) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
+ YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
+ y_buf += 4;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 16; // Advance 4 pixels.
+ }
+ if (width & 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void NV12ToARGBRow_C(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ uv_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void NV21ToARGBRow_C(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ vu_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void I422ToBGRARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+ }
+}
+
+void I422ToABGRRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+ }
+}
+
+void I422ToRGBARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+ }
+}
+
+void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
+ for (int x = 0; x < width; ++x) {
+ YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
+ y_buf += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ src += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ src_uv += (width - 1) << 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[-2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[-2 + 1];
+ src_uv -= 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+ const uint32* src32 = reinterpret_cast<const uint32*>(src);
+ uint32* dst32 = reinterpret_cast<uint32*>(dst);
+ src32 += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst32[x] = src32[0];
+ dst32[x + 1] = src32[-1];
+ src32 -= 2;
+ }
+ if (width & 1) {
+ dst32[width - 1] = src32[0];
+ }
+}
+
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[3];
+ src_uv += 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+ memcpy(dst, src, count);
+}
+
+void SetRow8_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+ // VC will generate rep stosb.
+ for (int x = 0; x < count; ++x) {
+ dst[x] = v8;
+ }
+#else
+ memset(dst, v8, count);
+#endif
+}
+
+void SetRows32_C(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ for (int x = 0; x < width; ++x) {
+ d[x] = v32;
+ }
+ dst += dst_stride;
+ }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_yuy2[1];
+ dst_v[0] = src_yuy2[3];
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_yuy2[0];
+ dst_y[x + 1] = src_yuy2[2];
+ src_yuy2 += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_yuy2[0];
+ }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+ dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_uyvy[0];
+ dst_v[0] = src_uyvy[2];
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_uyvy[1];
+ dst_y[x + 1] = src_uyvy[3];
+ src_uyvy += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_uyvy[1];
+ }
+}
+
+#define BLEND(f, b, a) ((((256 - (a)) * (b)) >> 8) + (f))
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
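+// src_argb0 is assumed to be premultiplied (attenuated) by its alpha, so
+// f + b * (256 - a) / 256 stays within a byte; e.g. with a = 192, a
+// premultiplied foreground of 150 over a background of 64 gives
+// ((64 * 64) >> 8) + 150 = 166.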
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+
+ fb = src_argb0[4 + 0];
+ fg = src_argb0[4 + 1];
+ fr = src_argb0[4 + 2];
+ a = src_argb0[4 + 3];
+ bb = src_argb1[4 + 0];
+ bg = src_argb1[4 + 1];
+ br = src_argb1[4 + 2];
+ dst_argb[4 + 0] = BLEND(fb, bb, a);
+ dst_argb[4 + 1] = BLEND(fg, bg, a);
+ dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ src_argb0 += 8;
+ src_argb1 += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+ }
+}
+#undef BLEND
+#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
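+// ATTENUATE approximates (f * a) / 255 by replicating both operands into
+// 16 bits, e.g. f = 255, a = 128: (0x8080 * 0xffff) >> 24 = 128.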
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width - 1; i += 2) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ b = src_argb[4];
+ g = src_argb[5];
+ r = src_argb[6];
+ a = src_argb[7];
+ dst_argb[4] = ATTENUATE(b, a);
+ dst_argb[5] = ATTENUATE(g, a);
+ dst_argb[6] = ATTENUATE(r, a);
+ dst_argb[7] = a;
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 b = src_argb[0];
+ const uint32 g = src_argb[1];
+ const uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.16 fixed point inverse table
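+// For example, a = 128 and a premultiplied b = 64 give ia = 65536 / 128 = 512
+// and (64 * 512) >> 8 = 128, where the exact answer is 64 * 255 / 128 = 127.5.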
+#define T(a) 0x10000 / a
+uint32 fixed_invtbl8[256] = {
+ 0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+ T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+ T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+ T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+ T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+ T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+ T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+ T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+ T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+ T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+ T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+ T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+ T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+ T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+ T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+ T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+ T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+ T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+ T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+ T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+ T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+ T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+ T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+ T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+ T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+ T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+ T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+ T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ if (a) {
+ const uint32 ia = fixed_invtbl8[a]; // 8.16 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ if (b > 255) {
+ b = 255;
+ }
+ if (g > 255) {
+ g = 255;
+ }
+ if (r > 255) {
+ r = 255;
+ }
+ }
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+// Wrappers to handle odd width
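+// n = width & ~7 is the largest multiple of 8 pixels; the SIMD row handles
+// those and the C row finishes the 0 to 7 pixel tail. UV_SHIFT converts the
+// pixel offset into a chroma offset: 1 for 4:2:2, 2 for 4:1:1, 0 for 4:4:4
+// and for the interleaved NV12/NV21 UV plane.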
+#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* u_buf, \
+ const uint8* v_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~7; \
+ I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \
+ I420TORGB_C(y_buf + n, \
+ u_buf + (n >> UV_SHIFT), \
+ v_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * 4, width & 7); \
+ }
+
+// Wrappers to handle odd width
+#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* uv_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~7; \
+ NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \
+ NV12TORGB_C(y_buf + n, \
+ uv_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * 4, width & 7); \
+ }
+
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
+Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
+#endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
+ I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
+#endif
+#ifdef HAS_I422TORGBAROW_SSSE3
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
+YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
+Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
+#endif
+#undef YANY
+
+#define RGBANY(NAMEANY, ARGBTORGB, BPP) \
+ void NAMEANY(const uint8* argb_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ SIMD_ALIGNED(uint8 row[kMaxStride]); \
+ ARGBTORGB(argb_buf, row, width); \
+ memcpy(rgb_buf, row, width * BPP); \
+ }
+
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
+RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
+RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
+RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
+RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3)
+#endif
+#undef RGBANY
+
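+// Unlike the wrappers above, this variant does not fall back to C for the
+// tail: it runs the SIMD row over the first width - 16 pixels and then once
+// more over the final 16, overlapping whatever the first pass already wrote,
+// so it effectively requires width >= 16.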
+#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ ARGBTOY_SSE(src_argb, dst_y, width - 16); \
+ ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \
+ }
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
+#endif
+#ifdef HAS_RGBATOYROW_SSSE3
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
+#endif
+#ifdef HAS_YUY2TOYROW_SSE2
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
+#endif
+#undef YANY
+
+#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
+ void NAMEANY(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~15; \
+ ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & 15); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
+#endif
+#ifdef HAS_RGBATOYROW_SSSE3
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
+#endif
+#undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
+ void NAMEANY(const uint8* src_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~15; \
+ ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & 15); \
+ }
+
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
+ YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
+ UYVYToUV422Row_C, 2)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \
+ YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \
+ UYVYToUV422Row_C, 2)
+#endif
+#undef UV422ANY
+
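+// ComputeCumulativeSumRow_C builds one row of a per-channel integral image:
+// cumsum[x] holds the sum of all pixels above and to the left, inclusive.
+// CumulativeSumToAverage_C then averages any axis-aligned box from its four
+// corner sums, bl[w] + tl[0] - bl[0] - tl[w], divided by the box area.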
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ int32 row_sum[4] = {0, 0, 0, 0};
+ for (int x = 0; x < width; ++x) {
+ row_sum[0] += row[x * 4 + 0];
+ row_sum[1] += row[x * 4 + 1];
+ row_sum[2] += row[x * 4 + 2];
+ row_sum[3] += row[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ }
+}
+
+void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
+ int w, int area, uint8* dst, int count) {
+ float ooa = 1.0f / area;
+ for (int i = 0; i < count; ++i) {
+ dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst += 4;
+ tl += 4;
+ bl += 4;
+ }
+}
+
+#define REPEAT8(v) ((v) | ((v) << 8))
+#define SHADE(f, v) (((v) * (f)) >> 24)
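+// SHADE approximates (f * value_channel) / 255; e.g. value = 0x80808080
+// roughly halves every channel: a channel of 200 becomes
+// (0xc8c8 * 0x8080) >> 24 = 100.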
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ for (int i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Copy pixels from rotated source to destination row with a slope.
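+// uv_dudv holds {u, v, du, dv}: the starting source coordinate and its
+// per-pixel increment; each destination pixel copies the source pixel at the
+// truncated (u, v).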
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ // Render a row of pixels from source into a buffer.
+ float uv[2];
+ uv[0] = uv_dudv[0];
+ uv[1] = uv_dudv[1];
+ for (int i = 0; i < width; ++i) {
+ int x = static_cast<int>(uv[0]);
+ int y = static_cast<int>(uv[1]);
+ *reinterpret_cast<uint32*>(dst_argb) =
+ *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
+ x * 4);
+ dst_argb += 4;
+ uv[0] += uv_dudv[2];
+ uv[1] += uv_dudv[3];
+ }
+}
+
+// C version 2x2 -> 2x1.
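+// Blends two source rows into one destination row:
+// out = (row0 * (256 - fraction) + row1 * fraction) >> 8, processing 2 ARGB
+// pixels (8 bytes) per iteration.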
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ uint8* end = dst_ptr + (dst_width << 2);
+ do {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+ dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+ dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+ dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+ dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+ dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+ src_ptr += 8;
+ src_ptr1 += 8;
+ dst_ptr += 8;
+ } while (dst_ptr < end);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
new file mode 100644
index 00000000..19a78330
--- /dev/null
+++ b/files/source/row_neon.cc
@@ -0,0 +1,829 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u32 {d2[0]}, [%1]! \n" \
+ "vld1.u32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n" \
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n" \
+
+#define YUV422TORGB \
+ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
+ "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
+ "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
+ "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
+ "vtrn.u8 d0, d1 \n" \
+ "vsub.s16 q0, q0, q15 \n"/* offset y */\
+ "vmul.s16 q0, q0, q14 \n" \
+ "vadd.s16 d18, d19 \n" \
+ "vqadd.s16 d20, d0, d16 \n" \
+ "vqadd.s16 d21, d1, d16 \n" \
+ "vqadd.s16 d22, d0, d17 \n" \
+ "vqadd.s16 d23, d1, d17 \n" \
+ "vqadd.s16 d16, d0, d18 \n" \
+ "vqadd.s16 d17, d1, d18 \n" \
+ "vqrshrun.s16 d0, q10, #6 \n" \
+ "vqrshrun.s16 d1, q11, #6 \n" \
+ "vqrshrun.s16 d2, q8, #6 \n" \
+ "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
+ "vmovl.u8 q11, d1 \n" \
+ "vmovl.u8 q8, d2 \n" \
+ "vtrn.u8 d20, d21 \n" \
+ "vtrn.u8 d22, d23 \n" \
+ "vtrn.u8 d16, d17 \n" \
+ "vmov.u8 d21, d16 \n"
+
+#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \
+ defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON)
+static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+#endif
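+
+// In the row functions below d24/d25 hold kUVToRB/kUVToG, d26 is the 128
+// bias removed from U and V, q14 is the Y scale (74) and q15 the Y offset
+// (16), mirroring YG, UB, UG, VG and VR in row_common.cc. YUV422TORGB leaves
+// interleaved B, G and R bytes in d20, d21 and d22, narrowed with a rounding
+// 6 bit shift, ready for a vst3/vst4 store.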
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%4] \n"
+ "vld1.u8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(uv_buf), // %1
+ "+r"(rgb_buf), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%4] \n"
+ "vld1.u8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(uv_buf), // %1
+ "+r"(rgb_buf), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_SPLITUV_NEON
+// Reads 16 pairs of UV and writes the even values to dst_u and the odd to dst_v.
+// Alignment requirement: 16 bytes for pointers, and a multiple of 16 pixels.
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.u8 {q0}, [%1]! \n" // store U
+ "vst1.u8 {q1}, [%2]! \n" // Store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_SPLITUV_NEON
+
+#ifdef HAS_COPYROW_NEON
+// Copy multiple of 64
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vldm %0!, {q0, q1, q2, q3} \n" // load 64
+ "subs %2, %2, #64 \n" // 64 processed per loop
+ "vstm %1!, {q0, q1, q2, q3} \n" // store 64
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+#ifdef HAS_SETROW_NEON
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile ( // NOLINT
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.u32 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "q0", "memory", "cc");
+}
+
+// TODO(fbarchard): Make fully assembler
+// SetRows32 writes 'width' words of a repeated 32 bit value for each of 'height' rows.
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ SetRow8_NEON(dst, v32, width << 2);
+ dst += dst_stride;
+ }
+}
+#endif // HAS_SETROW_NEON
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // compute where to start writing destination
+ "add %1, %2 \n"
+ // work on segments that are multiples of 16
+ "lsrs r3, %2, #4 \n"
+    // The output is written in two blocks: 8 bytes followed by
+    // another 8. Reading is done sequentially, from left to
+    // right; writing is done from right to left in 16-byte blocks.
+    // %1, the destination pointer, is incremented after writing
+    // the first of the two blocks, so subtract that 8 along with
+    // 16 to get the next location.
+ "mov r3, #-24 \n"
+ "beq 2f \n"
+
+    // Back the destination off by the size of the register that
+    // is going to be mirrored.
+ "sub %1, #16 \n"
+ // the loop needs to run on blocks of 16. what will be left
+ // over is either a negative number, the residuals that need
+ // to be done, or 0. If this isn't subtracted off here the
+ // loop will run one extra time.
+ "sub %2, #16 \n"
+
+    // Mirror the bytes within each 64 bit segment; the full 128
+    // bits cannot be reversed in one go, so the two 64 bit halves
+    // are written out in swapped order instead.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // src += 16
+ "subs %2, #16 \n"
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n"
+ "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
+ "bge 1b \n"
+
+    // Add 16 back to the counter. If the result is 0 there are no
+    // residuals, so jump past.
+ "adds %2, #16 \n"
+ "beq 5f \n"
+ "add %1, #16 \n"
+ "2: \n"
+ "mov r3, #-3 \n"
+ "sub %1, #2 \n"
+ "subs %2, #2 \n"
+ // check for 16*n+1 scenarios where segments_of_2 should not
+ // be run, but there is something left over.
+ "blt 4f \n"
+
+// do this in neon registers as per
+// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
+ "subs %2, #2 \n"
+ "vst1.8 {d1[0]}, [%1]! \n"
+ "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
+ "bge 3b \n"
+
+ "adds %2, #2 \n"
+ "beq 5f \n"
+ "4: \n"
+ "add %1, #1 \n"
+ "vld1.8 {d0[0]}, [%0] \n"
+ "vst1.8 {d0[0]}, [%1] \n"
+ "5: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "r3", "q0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORROWUV_NEON
+void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
+ asm volatile (
+ // compute where to start writing destination
+ "add %1, %3 \n" // dst_a + width
+ "add %2, %3 \n" // dst_b + width
+    // Work on input segments that are multiples of 16, but note
+    // the width that was passed counts output elements, half
+    // the size of the input.
+ "lsrs r12, %3, #3 \n"
+ "beq 2f \n"
+    // The output is written into two blocks.
+ "mov r12, #-8 \n"
+    // Back the destinations off by the size of the register that
+    // is going to be mirrored.
+ "sub %1, #8 \n"
+ "sub %2, #8 \n"
+ // the loop needs to run on blocks of 8. what will be left
+ // over is either a negative number, the residuals that need
+ // to be done, or 0. if this isn't subtracted off here the
+ // loop will run one extra time.
+ "sub %3, #8 \n"
+
+ // mirror the bytes in the 64 bit segments
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0]! \n" // src += 16
+ "subs %3, #8 \n"
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
+ "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
+ "bge 1b \n"
+
+    // Add 8 back to the counter. If the result is 0 there are no
+    // residuals, so return.
+ "adds %3, #8 \n"
+ "beq 4f \n"
+ "add %1, #8 \n"
+ "add %2, #8 \n"
+ "2: \n"
+ "mov r12, #-1 \n"
+ "sub %1, #1 \n"
+ "sub %2, #1 \n"
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
+ "subs %3, %3, #1 \n"
+ "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
+ "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
+ "bgt 3b \n"
+ "4: \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "r12", "q0"
+ );
+}
+#endif // HAS_MIRRORROWUV_NEON
+
+#ifdef HAS_BGRATOARGBROW_NEON
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d2 \n" // swap G, R
+ "vswp.u8 d0, d3 \n" // swap B, A
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_BGRATOARGBROW_NEON
+
+#ifdef HAS_ABGRTOARGBROW_NEON
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d0, d2 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_ABGRTOARGBROW_NEON
+
+#ifdef HAS_RGBATOARGBROW_NEON
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmov.u8 d4, d0 \n" // move A after RGB
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RGBATOARGBROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RAWTOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGBAROW_NEON
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmov.u8 d0, d4 \n" // move A before RGB.
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGBAROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.u8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.u8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.u8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.u8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "adds %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.u8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.u8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "adds %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.u8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.u8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc
index 88ce475b..33149dad 100644
--- a/files/source/row_posix.cc
+++ b/files/source/row_posix.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,652 +8,3655 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "row.h"
+#include "libyuv/row.h"
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
extern "C" {
+#endif
+
+// This module is for GCC x86 and x64
+#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+
+// GCC 4.2 on OSX has a link error when passing static or const to inline.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
#ifdef HAS_ARGBTOYROW_SSSE3
-// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
- 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+// Constants for ARGB
+CONST vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
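+
+// The vectors above and below are laid out in ARGB memory byte order
+// (B, G, R, A): kARGBToU and kARGBToV carry the same 112/-74/-38 and
+// -18/-94/112 weights as RGBToU/RGBToV in row_common.cc, while kARGBToY is
+// roughly half of the C coefficients 25/129/66.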
+
+CONST vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+CONST vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+// Constants for BGRA
+CONST vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+CONST vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+CONST vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+CONST vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+CONST vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+CONST vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+CONST uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
-extern "C" TALIGN16(const uint8, kAdd16[16]) = {
- 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+CONST uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+// Shuffle table for converting RGB24 to ARGB.
+CONST uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
- "movdqa (%4),%%xmm6\n"
- "movdqa %%xmm6,%%xmm5\n"
- "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
-"1:"
- "movdqa (%0),%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "psrlw $0x7,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "psrlw $0x7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "paddb %%xmm5,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "r"(kMultiplyMaskARGBToI400), // %3
- "r"(kAdd16) // %4
- : "memory"
-);
-}
-#endif
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_bg24), // %0
+// Shuffle table for converting ABGR to ARGB.
+CONST uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting BGRA to ARGB.
+CONST uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+CONST uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Shuffle table for converting ARGB to RGBA.
+CONST uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+CONST uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+CONST uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
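For reference, the shuffle tables above are operands for the SSSE3 pshufb instruction, which permutes one 16-byte register per table. A minimal scalar sketch of the per-byte semantics (the helper name is hypothetical and not part of this change): a mask byte with its high bit set writes zero, otherwise its low four bits select which source byte is copied.

#include <stdint.h>

// Scalar model of pshufb: mask bytes >= 0x80 zero the output byte, otherwise
// the low 4 bits of the mask select one of the 16 source bytes.
static void ScalarPshufb(const uint8_t src[16], const uint8_t mask[16],
                         uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}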
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskBG24ToARGB) // %3
- : "memory"
-);
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_abgr), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskABGRToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskBGRAToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_rgba), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGBAToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGBA) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskRAWToARGB) // %3
- : "memory"
-);
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
+ );
+}
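RGB565ToARGBRow_SSE2 above widens each 5- or 6-bit channel with the fixed-point multipliers 0x0108 and 0x2080, which reproduces the usual bit-replication expansion. A per-pixel scalar sketch of the net result (hypothetical helper, not part of libyuv; output bytes are stored B,G,R,A):

#include <stdint.h>

// Expand one RGB565 pixel to 8-bit B,G,R,A by replicating the top bits of
// each narrow channel into its low bits; alpha is forced to opaque.
static void RGB565PixelToARGB(uint16_t rgb565, uint8_t argb[4]) {
  uint8_t b5 = rgb565 & 0x1f;
  uint8_t g6 = (rgb565 >> 5) & 0x3f;
  uint8_t r5 = (rgb565 >> 11) & 0x1f;
  argb[0] = static_cast<uint8_t>((b5 << 3) | (b5 >> 2));  // B
  argb[1] = static_cast<uint8_t>((g6 << 2) | (g6 >> 4));  // G
  argb[2] = static_cast<uint8_t>((r5 << 3) | (r5 >> 2));  // R
  argb[3] = 255;                                          // A
}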
-#if defined(__x86_64__)
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
-// 64 bit linux gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsBgraY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsAbgrY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 1(%0),%0\n"
- "paddsw %%xmm0,%%xmm2\n"
- "shufps $0x44,%%xmm2,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movd %%xmm2,0x0(%3)\n"
- "lea 4(%3),%3\n"
- "sub $0x1,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
-);
-}
-
-void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%0),%%r10\n"
- "movzb 0x1(%0),%%r11\n"
- "movq (%3,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%3,%%r11,8),%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%1)\n"
- "lea 8(%1),%1\n"
- "sub $0x2,%2\n"
- "ja 1b\n"
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%1,%0,2) \n"
+ "movdqa %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
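ARGB4444ToARGBRow_SSE2 above widens each 4-bit channel by duplicating the nibble into both halves of the output byte (equivalent to multiplying by 17). A scalar sketch of the per-pixel effect, using hypothetical helper names:

#include <stdint.h>

// Duplicate a 4-bit value into both nibbles of a byte: 0xN -> 0xNN (== n * 17).
static uint8_t Expand4To8(uint8_t nibble) {
  return static_cast<uint8_t>(((nibble & 0x0f) << 4) | (nibble & 0x0f));
}

// Expand one 2-byte ARGB4444 pixel to 4 bytes: each source nibble becomes one
// output byte, low nibble first, preserving the source channel order.
static void ARGB4444PixelToARGB(const uint8_t src[2], uint8_t dst[4]) {
  dst[0] = Expand4To8(src[0] & 0x0f);
  dst[1] = Expand4To8(src[0] >> 4);
  dst[2] = Expand4To8(src[1] & 0x0f);
  dst[3] = Expand4To8(src[1] >> 4);
}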
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
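ARGBToRGB565Row_SSE2 above packs each pixel by keeping the top 5/6/5 bits of B, G and R and dropping alpha. A per-pixel scalar sketch of the net result (hypothetical helper; ARGB bytes are stored B,G,R,A in memory):

#include <stdint.h>

// Pack one B,G,R,A pixel into RGB565: B in bits 0-4, G in 5-10, R in 11-15.
static uint16_t ARGBPixelToRGB565(const uint8_t argb[4]) {
  return static_cast<uint16_t>((argb[0] >> 3) |
                               ((argb[1] >> 2) << 5) |
                               ((argb[2] >> 3) << 11));
}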
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
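ARGBToYRow_SSSE3 above applies the kARGBToY weights with pmaddubsw/phaddw, shifts by 7 and adds 16, approximating BT.601 studio-range luma. A per-pixel scalar sketch (hypothetical helper; ARGB bytes are stored B,G,R,A in memory):

#include <stdint.h>

// Y = ((13*B + 65*G + 33*R) >> 7) + 16, i.e. roughly
// 0.098*B + 0.508*G + 0.258*R + 16.
static uint8_t ARGBPixelToY(const uint8_t* argb) {
  int b = argb[0], g = argb[1], r = argb[2];
  return static_cast<uint8_t>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}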
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
+// With fpic, GCC 4.2 for OS X runs out of GPR registers: ebx, ebp and eax
+// are effectively taken, so "m" operands can only be addressed with the 3
+// remaining general registers (4 if the stack frame is disabled). Splitting
+// the code into two assembly blocks is a workaround, but is considered unsafe.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
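ARGBToUVRow_SSSE3 above averages each 2x2 block of pixels across two rows, then applies the signed kARGBToU / kARGBToV weights and re-centers the result at 128. A scalar sketch of one 2x2 block (hypothetical helper; the rounding of the averages differs slightly from the pavgb sequence, and the shifts are assumed to be arithmetic, as psraw is):

#include <stdint.h>

// row0/row1 each point at two horizontally adjacent pixels of two vertically
// adjacent rows (4 bytes per pixel, stored B,G,R,A).
// U = ((112*B - 74*G - 38*R) >> 8) + 128
// V = ((-18*B - 94*G + 112*R) >> 8) + 128, computed on the averaged block.
static void ARGBBlockToUV(const uint8_t* row0, const uint8_t* row1,
                          uint8_t* u, uint8_t* v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = static_cast<uint8_t>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}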
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+#define UB 127 /* 2.018 * 64 = ~129, clamped to int8 max 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+struct {
+ vec8 kUVToB; // 0
+ vec8 kUVToG; // 16
+ vec8 kUVToR; // 32
+ vec16 kUVBiasB; // 48
+ vec16 kUVBiasG; // 64
+ vec16 kUVBiasR; // 80
+ vec16 kYSub16; // 96
+ vec16 kYToRgb; // 112
+ vec8 kVUToB; // 128
+ vec8 kVUToG; // 144
+ vec8 kVUToR; // 160
+} CONST SIMD_ALIGNED(kYuvConstants) = {
+ { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR },
+ { 16, 16, 16, 16, 16, 16, 16, 16 },
+ { YG, YG, YG, YG, YG, YG, YG, YG },
+ { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+};
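The constants above encode a 6-bit fixed-point version of the BT.601 YUV-to-RGB transform; the YUVTORGB macro below evaluates it with pmaddubsw and saturating adds. A scalar sketch of the per-pixel arithmetic (hypothetical helpers, not part of this change; the shift is assumed arithmetic, matching psraw):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// B = ((y-16)*YG + UB*(u-128)) >> 6, and similarly for G and R, using
// UB=127, UG=-25, VG=-52, VR=102, YG=74 from the definitions above.
static void YuvPixelToRGB(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;
  *b = Clamp255((y1 + 127 * (u - 128)) >> 6);
  *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);
  *r = Clamp255((y1 + 102 * (v - 128)) >> 6);
}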
+
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x2(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n" \
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(vu_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(vu_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%[argb_buf]) \n"
+ "movdqa %%xmm0,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(bgra_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(abgr_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm5,(%[argb_buf]) \n"
+ "movdqu %%xmm0,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(bgra_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(abgr_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "mov $0x10001000,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "mov $0x012a012a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,16(%1) \n"
+ "lea 32(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
- "+r"(width) // %2
- : "r" (_kCoefficientsRgbY) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-#elif defined(__i386__)
-// 32 bit gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToRGB32Row\n"
-"_FastConvertYUVToRGB32Row:\n"
-#else
- ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsRgbY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToBGRARow\n"
-"_FastConvertYUVToBGRARow:\n"
-#else
- ".global FastConvertYUVToBGRARow\n"
-"FastConvertYUVToBGRARow:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsBgraY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToABGRRow\n"
-"_FastConvertYUVToABGRRow:\n"
-#else
- ".global FastConvertYUVToABGRRow\n"
-"FastConvertYUVToABGRRow:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUV444ToRGB32Row\n"
-"_FastConvertYUV444ToRGB32Row:\n"
-#else
- ".global FastConvertYUV444ToRGB32Row\n"
-"FastConvertYUV444ToRGB32Row:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
- "lea 1(%edx),%edx\n"
- "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
- "psraw $0x6,%mm0\n"
- "packuswb %mm0,%mm0\n"
- "movd %mm0,0x0(%ebp)\n"
- "lea 4(%ebp),%ebp\n"
- "sub $0x1,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYToRGB32Row\n"
-"_FastConvertYToRGB32Row:\n"
-#else
- ".global FastConvertYToRGB32Row\n"
-"FastConvertYToRGB32Row:\n"
-#endif
- "push %ebx\n"
- "mov 0x8(%esp),%eax\n"
- "mov 0xc(%esp),%edx\n"
- "mov 0x10(%esp),%ecx\n"
-
-"1:"
- "movzbl (%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
- "psraw $0x6,%mm0\n"
- "movzbl 0x1(%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
- "psraw $0x6,%mm1\n"
- "packuswb %mm1,%mm0\n"
- "lea 0x2(%eax),%eax\n"
- "movq %mm0,(%edx)\n"
- "lea 0x8(%edx),%edx\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "pop %ebx\n"
- "ret\n"
-);
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_YTOARGBROW_SSE2
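
The Y-only path above implements the formula noted in its first step, G = (y - 16) * 1.164, in 16-bit fixed point. A plain scalar sketch of the same idea (the function name and the 298/256 integer approximation are illustrative, and low-bit rounding differs from the SSE2 code):

#include <stdint.h>

static uint8_t Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

// Expand one row of Y to opaque gray ARGB: B = G = R = (y - 16) * 1.164.
void YToARGBRow_Sketch(const uint8_t* y_buf, uint8_t* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    // 1.164 is approximated as 298 / 256 in integer arithmetic.
    uint8_t value = Clamp255(((y_buf[x] - 16) * 298) >> 8);
    rgb_buf[0] = value;   // B
    rgb_buf[1] = value;   // G
    rgb_buf[2] = value;   // R
    rgb_buf[3] = 255u;    // A
    rgb_buf += 4;
  }
}
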
-#else
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
-
-static inline void YuvPixel(uint8 y,
- uint8 u,
- uint8 v,
- uint8* rgb_buf,
- int ashift,
- int rshift,
- int gshift,
- int bshift) {
-
- int b = _kCoefficientsRgbY[256+u][0];
- int g = _kCoefficientsRgbY[256+u][1];
- int r = _kCoefficientsRgbY[256+u][2];
- int a = _kCoefficientsRgbY[256+u][3];
-
- b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
- g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
- r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
- a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
-
- b = paddsw(b, _kCoefficientsRgbY[y][0]);
- g = paddsw(g, _kCoefficientsRgbY[y][1]);
- r = paddsw(r, _kCoefficientsRgbY[y][2]);
- a = paddsw(a, _kCoefficientsRgbY[y][3]);
-
- b >>= 6;
- g >>= 6;
- r >>= 6;
- a >>= 6;
-
- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
- (packuswb(g) << gshift) |
- (packuswb(r) << rshift) |
- (packuswb(a) << ashift);
-}
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+CONST uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
}
+#endif // HAS_MIRRORROW_SSSE3
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
- }
- rgb_buf += 8; // Advance 2 pixels.
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0,%2),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+CONST uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea -16(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea -16(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "sub $8,%3 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,(%1,%2) \n"
+ "lea 8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+CONST uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_SPLITUV_SSE2
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm2,(%1,%2) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SPLITUV_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "movdqa %%xmm1,0x10(%0,%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+void CopyRow_X86(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%2 \n"
+ "rep movsl \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_SETROW_X86
+void SetRow8_X86(uint8* dst, uint32 v32, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%1 \n"
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ size_t width_tmp = static_cast<size_t>(width);
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ asm volatile (
+ "rep stosl \n"
+ : "+D"(d), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+ dst += dst_stride;
}
}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+CONST uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time.
+// Same as SSE2, but replaces
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
+//    pshuflw    xmm3, xmm3,0F5h
+// with
+//    pshufb     xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%0 \n"
+ "jne 41f \n"
+ "test $0xf,%1 \n"
+ "jne 41f \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+ "jmp 49f \n"
+
+ // 4 pixel unaligned loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
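
Both blend variants above perform the same per-channel arithmetic; only the way the foreground alpha is broadcast differs (shift-and-shuffle versus a single pshufb with kShuffleAlpha). A scalar sketch of that arithmetic as read from the asm, with an illustrative helper name (src0 is the foreground whose alpha weights src1, and the result alpha is forced opaque):

#include <stdint.h>

static uint8_t AddSat255(int v) { return v > 255 ? 255 : (uint8_t)v; }

// dst = src0 + src1 * (256 - alpha0) / 256, destination alpha forced to 255.
void ARGBBlendPixel_Sketch(const uint8_t src0[4], const uint8_t src1[4],
                           uint8_t dst[4]) {
  int ia = 256 - src0[3];                                // inverse foreground alpha
  dst[0] = AddSat255(src0[0] + ((src1[0] * ia) >> 8));   // B
  dst[1] = AddSat255(src0[1] + ((src1[1] * ia) >> 8));   // G
  dst[2] = AddSat255(src0[2] + ((src1[2] * ia) >> 8));   // R
  dst[3] = 255u;                                         // A
}
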
+
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATE_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+CONST uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+CONST uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
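
Attenuation premultiplies each color channel by the pixel's own alpha. A minimal scalar sketch (illustrative name; the SIMD code's 8.8 fixed-point multiply keeps a fully opaque 255 at 255, while the plain shift below rounds slightly lower):

#include <stdint.h>

// Premultiply B, G and R by alpha; alpha itself is left unchanged.
void ARGBAttenuatePixel_Sketch(const uint8_t src[4], uint8_t dst[4]) {
  uint32_t a = src[3];
  dst[0] = (uint8_t)((src[0] * a) >> 8);  // B
  dst[1] = (uint8_t)((src[1] * a) >> 8);  // G
  dst[2] = (uint8_t)((src[2] * a) >> 8);  // R
  dst[3] = (uint8_t)a;                    // A
}
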
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movzb 0x3(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0x7(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
+ "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movzb 0xb(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0xf(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
+ "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
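
Unattenuate is the approximate inverse: each color channel is scaled back up by 255 / alpha. The SSE2 code avoids the per-pixel division by looking up a reciprocal in fixed_invtbl8; the sketch below shows only the scalar idea, and its rounding will not match the table-based path exactly:

#include <stdint.h>

static uint8_t Clamp255u(uint32_t v) { return v > 255 ? 255 : (uint8_t)v; }

// Undo premultiplication: c = c * 255 / a (clamped); alpha is unchanged.
void ARGBUnattenuatePixel_Sketch(const uint8_t src[4], uint8_t dst[4]) {
  uint32_t a = src[3];
  if (a == 0) {  // nothing to recover from a fully transparent pixel
    dst[0] = dst[1] = dst[2] = 0;
  } else {
    dst[0] = Clamp255u(src[0] * 255u / a);  // B
    dst[1] = Clamp255u(src[1] * 255u / a);  // G
    dst[2] = Clamp255u(src[2] * 255u / a);  // R
  }
  dst[3] = (uint8_t)a;
}
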
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 u = u_buf[x];
- uint8 v = v_buf[x];
- uint8 y = y_buf[x];
- YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
+CONST vec8 kARGBToGray = {
+ 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa 0x10(%0),%%xmm3 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "movdqa %%xmm1,0x10(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToGray) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
}
+#endif // HAS_ARGBGRAYROW_SSSE3
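
A scalar sketch of the gray conversion above, using the same 14/76/38 weights over 128 (illustrative function name; alpha handling mirrors the asm, which carries the source alpha through):

#include <stdint.h>

// gray = (14 * B + 76 * G + 38 * R) >> 7  (~0.11 B + 0.59 G + 0.30 R),
// written back to B, G and R; alpha is preserved.
void ARGBGrayRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t gray = (uint8_t)((14 * src_argb[0] + 76 * src_argb[1] +
                              38 * src_argb[2]) >> 7);
    dst_argb[0] = gray;
    dst_argb[1] = gray;
    dst_argb[2] = gray;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
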
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 y = y_buf[x];
- YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+CONST vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+CONST vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+CONST vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
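
A scalar sketch of the sepia formulas quoted above (illustrative name; the explicit clamp mirrors the saturating packuswb in the asm, and alpha is left untouched):

#include <stdint.h>

static uint8_t SepiaClamp(int v) { return v > 255 ? 255 : (uint8_t)v; }

// Apply the sepia weights in place; alpha is preserved.
void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    dst_argb[0] = SepiaClamp((r * 35 + g * 68 + b * 17) >> 7);
    dst_argb[1] = SepiaClamp((r * 45 + g * 88 + b * 22) >> 7);
    dst_argb[2] = SepiaClamp((r * 50 + g * 98 + b * 24) >> 7);
    dst_argb += 4;
  }
}
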
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ asm volatile (
+ "movd (%2),%%xmm2 \n"
+ "movd 0x4(%2),%%xmm3 \n"
+ "movd 0x8(%2),%%xmm4 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm6,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm0 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(matrix_argb) // %2
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// aligned to 16 bytes
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqa (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "sub $0x4,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
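
As read from the asm, each color channel is bucketed as ((c * scale) >> 16) * interval_size + interval_offset while alpha passes through. A scalar sketch under that reading (illustrative name; the clamp mirrors the saturating pack):

#include <stdint.h>

static uint8_t QClamp(int v) { return v > 255 ? 255 : (uint8_t)v; }

// Quantize B, G and R in place; alpha is preserved.
void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale, int interval_size,
                            int interval_offset, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = QClamp(v);
    }
    dst_argb += 4;
  }
}
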
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop \n"
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqa (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqa 0x10(%1,%2,1),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqa 0x20(%1,%2,1),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqa 0x30(%1,%2,1),%%xmm5 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm2,(%1) \n"
+ "movdqa %%xmm3,0x10(%1) \n"
+ "movdqa %%xmm4,0x20(%1) \n"
+ "movdqa %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 2 \n"
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
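
A scalar sketch of the cumulative-sum row described above, matching the 1-pixel tail loop of the SSE2 code: a running per-channel sum across the row, plus the previous row's cumulative sums:

#include <stdint.h>

// cumsum[x] = sum of this row up to and including x, plus the cumulative
// sum of the row above (previous_cumsum), for each of the 4 channels.
void ComputeCumulativeSumRow_Sketch(const uint8_t* row, int32_t* cumsum,
                                    const int32_t* previous_cumsum, int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
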
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) {
+ asm volatile (
+ "movd %5,%%xmm4 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "rcpss %%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop \n"
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 2 \n"
+ "10: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"(static_cast<intptr_t>(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
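
The averaging step takes the four-corner difference of the summed-area table and scales it by 1/area (the asm uses rcpss, an approximate reciprocal). A scalar sketch under the assumption that width is given in int32 elements, as the SSE2 indexing suggests; which pointer is the top row depends on the caller:

#include <stdint.h>

static uint8_t AvgClamp(int32_t v) {
  return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

// For each output pixel, combine the four summed-area corners and divide
// by the box area, per channel.
void CumulativeSumToAverage_Sketch(const int32_t* topleft,
                                   const int32_t* botleft,
                                   int width, int area, uint8_t* dst,
                                   int count) {
  float ooa = 1.0f / (float)area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32_t sum = topleft[c] - topleft[width + c] -
                    botleft[c] + botleft[width + c];
      dst[c] = AvgClamp((int32_t)(sum * ooa));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
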
+#ifdef HAS_ARGBSHADE_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "sub %0,%1 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBSHADE_SSE2
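
Shading multiplies each channel by the corresponding channel of the packed value argument. A scalar sketch of roughly dst = src * value_channel / 256 (illustrative name; the 8.8 fixed-point path in the asm keeps a 0xFF scale at full brightness, while the plain shift here rounds down slightly):

#include <stdint.h>

// Scale each channel by the matching channel of 'value' (packed B, G, R, A
// from low byte to high byte).
void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb, int width,
                         uint32_t value) {
  uint32_t vb = value & 0xff;
  uint32_t vg = (value >> 8) & 0xff;
  uint32_t vr = (value >> 16) & 0xff;
  uint32_t va = (value >> 24) & 0xff;
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = (uint8_t)((src_argb[0] * vb) >> 8);
    dst_argb[1] = (uint8_t)((src_argb[1] * vg) >> 8);
    dst_argb[2] = (uint8_t)((src_argb[2] * vr) >> 8);
    dst_argb[3] = (uint8_t)((src_argb[3] * va) >> 8);
    src_argb += 4;
    dst_argb += 4;
  }
}
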
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
+// Copy ARGB pixels from source image with slope to a row of destination.
+// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
+// an error if movq is used. movd %%xmm0,%1
+
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq (%3),%%xmm2 \n"
+ "movq 0x8(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop \n"
+ ".p2align 4 \n"
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "cvttps2dq %%xmm3,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+ "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+ "movd (%0,%1,1),%%xmm1 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "sub $0x4,%4 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 4 \n"
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
+ "and $0x0fffffff,%1 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "sub $0x1,%4 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(uv_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
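
A scalar sketch of the affine row copy: step (u, v) across the source by (du, dv) per destination pixel and fetch one ARGB pixel each time. The [u, v, du, dv] layout of uv_dudv is inferred from how the asm loads it and is an assumption here:

#include <stdint.h>

// Nearest-neighbor copy of ARGB pixels along a slope through the source.
void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                          uint8_t* dst_argb, const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = s[0];
    dst_argb[1] = s[1];
    dst_argb[2] = s[2];
    dst_argb[3] = s[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
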
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ "4: \n"
+ ".p2align 4 \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
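
A scalar sketch of the vertical interpolation, ignoring the fast paths for a zero and a half fraction and the divide-by-two rounding the SSSE3 code applies to source_y_fraction:

#include <stddef.h>
#include <stdint.h>

// Vertical blend of two rows: dst = (row0 * (256 - f) + row1 * f) >> 8,
// where f = source_y_fraction in [0, 256] and dst_width is in ARGB pixels.
void ARGBInterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  int f = source_y_fraction;
  int count = dst_width * 4;  // 4 bytes per ARGB pixel
  for (int i = 0; i < count; ++i) {
    dst_ptr[i] = (uint8_t)((src_ptr[i] * (256 - f) +
                            src_ptr[i + src_stride] * f) >> 8);
  }
}
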
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_table.cc b/files/source/row_table.cc
deleted file mode 100644
index 022d9f88..00000000
--- a/files/source/row_table.cc
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "row.h"
-
-#define kMaxStride (2048 * 4)
-
-extern "C" {
-
-#define MAKETABLE(NAME) \
-SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
- RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), \
- RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), \
- RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), \
- RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), \
- RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), \
- RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), \
- RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), \
- RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), \
- RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), \
- RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), \
- RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), \
- RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), \
- RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), \
- RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), \
- RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), \
- RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), \
- RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), \
- RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), \
- RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), \
- RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), \
- RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), \
- RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), \
- RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), \
- RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), \
- RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), \
- RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), \
- RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), \
- RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), \
- RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), \
- RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), \
- RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), \
- RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), \
- RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), \
- RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), \
- RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), \
- RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), \
- RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), \
- RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), \
- RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), \
- RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), \
- RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), \
- RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), \
- RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), \
- RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), \
- RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), \
- RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), \
- RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), \
- RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), \
- RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), \
- RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), \
- RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), \
- RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), \
- RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), \
- RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), \
- RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), \
- RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), \
- RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), \
- RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), \
- RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), \
- RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), \
- RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), \
- RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), \
- RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), \
- RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), \
- RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), \
- RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), \
- RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), \
- RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), \
- RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), \
- RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), \
- RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), \
- RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), \
- RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), \
- RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), \
- RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), \
- RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), \
- RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), \
- RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), \
- RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), \
- RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), \
- RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), \
- RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), \
- RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), \
- RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), \
- RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), \
- RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), \
- RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), \
- RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), \
- RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), \
- RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), \
- RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), \
- RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), \
- RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), \
- RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), \
- RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), \
- RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), \
- RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), \
- RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), \
- RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), \
- RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), \
- RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), \
- RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), \
- RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), \
- RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), \
- RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), \
- RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), \
- RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), \
- RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), \
- RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), \
- RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), \
- RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), \
- RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), \
- RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), \
- RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), \
- RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), \
- RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), \
- RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), \
- RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), \
- RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), \
- RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), \
- RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), \
- RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), \
- RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), \
- RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), \
- RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), \
- RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), \
- RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), \
- RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), \
- RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), \
- RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), \
- RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), \
- RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), \
- RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), \
- RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), \
- RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), \
- RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), \
- RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), \
- RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), \
- RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), \
- RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), \
- RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), \
- RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), \
- RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), \
- RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), \
- RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), \
- RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), \
- RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), \
- RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), \
- RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), \
- RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), \
- RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), \
- RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), \
- RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), \
- RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), \
- RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), \
- RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), \
- RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), \
- RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), \
- RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), \
- RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), \
- RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), \
- RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), \
- RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), \
- RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), \
- RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), \
- RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), \
- RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), \
- RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), \
- RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), \
- RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), \
- RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), \
- RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), \
- RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), \
- RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), \
- RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), \
- RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), \
- RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), \
- RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), \
- RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), \
- RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), \
- RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), \
- RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), \
- RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), \
- RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), \
- RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), \
- RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), \
- RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), \
- RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), \
- RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), \
- RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), \
- RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), \
- RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
-};
-
-// ARGB table
-#define RGBY(i) { \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(256 * 64 - 1) \
-}
-
-#define RGBU(i) { \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- 0, \
- 0 \
-}
-
-#define RGBV(i) { \
- 0, \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsRgbY)
-#else
-MAKETABLE(_kCoefficientsRgbY)
-#endif
-
-#undef RGBY
-#undef RGBU
-#undef RGBV
-
-// BGRA table
-#define RGBY(i) { \
- static_cast<int16>(256 * 64 - 1), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
-}
-
-#define RGBU(i) { \
- 0, \
- 0, \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
-}
-
-#define RGBV(i) { \
- 0, \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsBgraY)
-#else
-MAKETABLE(_kCoefficientsBgraY)
-#endif
-
-
-#undef RGBY
-#undef RGBU
-#undef RGBV
-
-// ABGR table
-#define RGBY(i) { \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(256 * 64 - 1) \
-}
-
-#define RGBU(i) { \
- 0, \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#define RGBV(i) { \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- 0, \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsAbgrY)
-#else
-MAKETABLE(_kCoefficientsAbgrY)
-#endif
-
-
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_raw += 3;
- }
-}
-
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 b = src_bg24[0];
- uint8 g = src_bg24[1];
- uint8 r = src_bg24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
-    dst_argb[3] = 255u;
- dst_argb += 4;
- src_bg24 += 3;
- }
-}
-
-// C versions of the RGB24/RAW row functions take the same approach: convert to ARGB first, then to Y/UV.
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- BG24ToARGBRow_C(src_argb, row, pix);
- ARGBToYRow_C(row, dst_y, pix);
-}
-
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- RAWToARGBRow_C(src_argb, row, pix);
- ARGBToYRow_C(row, dst_y, pix);
-}
-
-void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_C(src_argb, row, pix);
- BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_C(src_argb, row, pix);
- RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
-}
-
-static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
-}
-static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
-}
-
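The fixed-point RGBToY/RGBToU/RGBToV helpers above encode the BT.601 studio-swing matrix. As a quick sanity check (illustration only, not part of this patch; the standalone helpers below simply mirror those formulas), pure white should land on the white point (Y, U, V) = (235, 128, 128) and pure black on (16, 128, 128):

// Illustrative check of the fixed-point coefficients used above.
#include <cassert>
#include <cstdint>

static int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
static int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}

int main() {
  assert(RGBToY(255, 255, 255) == 235);  // BT.601 white level
  assert(RGBToU(255, 255, 255) == 128);  // neutral chroma
  assert(RGBToV(255, 255, 255) == 128);
  assert(RGBToY(0, 0, 0) == 16);         // BT.601 black level
  return 0;
}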
-#define MAKEROWY(NAME,R,G,B) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- for (int x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += 4; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- for (int x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
- src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
- src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
- src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += 8; \
- src_rgb1 += 8; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
-}
-
-MAKEROWY(ARGB,2,1,0)
-MAKEROWY(BGRA,1,2,3)
-MAKEROWY(ABGR,0,1,2)
-
-#if defined(HAS_RAWTOYROW_SSSE3)
-
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- ARGBToYRow_SSSE3(row, dst_y, pix);
-}
-
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- ARGBToYRow_SSSE3(row, dst_y, pix);
-}
-
-#endif
-
-#if defined(HAS_RAWTOUVROW_SSSE3)
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-#else
-
-void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-#endif
-#endif
-
-} // extern "C"
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 2bc5fb13..e3b01f27 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,173 +8,925 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "row.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
+namespace libyuv {
extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const int8, kARGBToY[16]) = {
+// Constants for ARGB.
+static const vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
-extern "C" TALIGN16(const int8, kARGBToU[16]) = {
+static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
-extern "C" TALIGN16(const int8, kARGBToV[16]) = {
+static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
-// Constants for BGRA
-extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
-extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
+static const vec8 kBGRAToU = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
-extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
+static const vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
-// Constants for ABGR
-extern "C" TALIGN16(const int8, kABGRToY[16]) = {
+// Constants for ABGR.
+static const vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
-extern "C" TALIGN16(const int8, kABGRToU[16]) = {
+static const vec8 kABGRToU = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
-extern "C" TALIGN16(const int8, kABGRToV[16]) = {
+static const vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
-extern "C" TALIGN16(const uint8, kAddY16[16]) = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
-extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
+static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values
-__declspec(naked)
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Shuffle table for converting ARGB to RGBA.
+static const uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_bgra
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskBGRAToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_abgr
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskABGRToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_rgba
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskRGBAToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgba
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskARGBToRGBA
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRAWToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
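Per channel, the pmul trick described above is equivalent to the usual bit-replication idiom. A minimal scalar sketch (illustration only; the helper name is not part of libyuv), assuming 5-bit red/blue and 6-bit green fields:

#include <cstdint>

// Expand one RGB565 pixel to 8-bit channels by replicating the top bits,
// which is what the (256 + 8) and (256 + 4) multipliers achieve.
static void RGB565ToARGBPixel(uint16_t rgb565, uint8_t argb[4]) {
  uint8_t b5 = rgb565 & 0x1f;
  uint8_t g6 = (rgb565 >> 5) & 0x3f;
  uint8_t r5 = (rgb565 >> 11) & 0x1f;
  argb[0] = (b5 << 3) | (b5 >> 2);  // == (b5 * (256 + 8)) >> 5
  argb[1] = (g6 << 2) | (g6 >> 4);  // == (g6 * (256 + 4)) >> 6
  argb[2] = (r5 << 3) | (r5 >> 2);
  argb[3] = 255;                    // opaque alpha
}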
+
+// 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1
+ por xmm2, xmm3
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
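Similarly, the nibble masking and OR of shifted copies above amounts to multiplying each 4-bit field by 17 (0x11). A hedged scalar sketch of one pixel, for illustration only (helper name not part of libyuv):

#include <cstdint>

// ARGB4444 -> ARGB8888: each 4-bit channel c expands to (c << 4) | c, i.e. c * 17.
static void ARGB4444ToARGBPixel(uint16_t argb4444, uint8_t argb[4]) {
  argb[0] = (argb4444 & 0x000f) * 17;          // B
  argb[1] = ((argb4444 >> 4) & 0x000f) * 17;   // G
  argb[2] = ((argb4444 >> 8) & 0x000f) * 17;   // R
  argb[3] = ((argb4444 >> 12) & 0x000f) * 17;  // A
}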
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+    psrld      xmm0, 4
+    psrld      xmm1, 8
+ por xmm0, xmm1
+ packuswb xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kARGBToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
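In scalar terms, the kARGBToY / kAddY16 path above computes roughly Y = ((13*B + 65*G + 33*R) >> 7) + 16 per pixel: pmaddubsw and phaddw form the weighted sum, psrlw 7 scales it, and paddb adds the 16 offset. A minimal C sketch of the same arithmetic, for illustration only (the helper name is not part of libyuv):

#include <cstdint>

// One row of ARGB (B,G,R,A byte order in memory) to Y, mirroring the
// SSSE3 coefficients {13, 65, 33, 0} and the +16 offset. No rounding
// term is added, matching the psrlw-based kernel above.
static void ARGBToYRow_Scalar(const uint8_t* src_argb, uint8_t* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = static_cast<uint8_t>(
        ((13 * src_argb[0] + 65 * src_argb[1] + 33 * src_argb[2]) >> 7) + 16);
    src_argb += 4;
  }
}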
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kBGRAToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kABGRToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -185,12 +937,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kARGBToU
- movdqa xmm6, _kARGBToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -227,18 +980,89 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
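The two steps called out in the comments above (a 2x2 subsample, then a pmaddubsw dot product with kARGBToU / kARGBToV and a +128 bias) boil down to roughly U = ((112*B - 74*G - 38*R) >> 8) + 128 and V = ((112*R - 94*G - 18*B) >> 8) + 128 on the averaged pixel. A hedged scalar sketch (illustration only; helper name not part of libyuv; pavgb rounds each pairwise average, so results can differ by a bit, and an arithmetic right shift on negative intermediates is assumed):

#include <cstdint>

// Subsample pairs of ARGB pixels across two rows to U and V values,
// approximating the SSSE3 path above with a plain 2x2 average.
static void ARGBToUVRow_Scalar(const uint8_t* row0, int stride,
                               uint8_t* dst_u, uint8_t* dst_v, int pix) {
  const uint8_t* row1 = row0 + stride;
  for (int x = 0; x < pix; x += 2) {
    int b = (row0[0] + row0[4] + row1[0] + row1[4]) >> 2;
    int g = (row0[1] + row0[5] + row1[1] + row1[5]) >> 2;
    int r = (row0[2] + row0[6] + row1[2] + row1[6]) >> 2;
    *dst_u++ = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = static_cast<uint8_t>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    row0 += 8;
    row1 += 8;
  }
}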
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -249,12 +1073,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kBGRAToU
- movdqa xmm6, _kBGRAToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -291,18 +1116,89 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -313,12 +1209,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kABGRToU
- movdqa xmm6, _kABGRToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -355,282 +1252,2846 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
-__declspec(naked)
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
- mov eax, [esp + 4] // src_bg24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskBG24ToARGB
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
- convertloop :
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
- movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129, clamped to int8 max of 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
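Taken together, these constants implement a 6-bit fixed-point BT.601 conversion: the UV terms are pre-biased by 128 via kUVBias*, Y is offset by 16 and scaled by YG, and the sum is shifted right by 6 and saturated. A minimal scalar sketch of the same arithmetic (illustration only, not part of the patch; the kernels below use saturating adds and packuswb where this sketch clamps explicitly):

#include <cstdint>

static uint8_t Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// One YUV pixel to ARGB using the fixed-point constants above
// (YG=74, UB=127, UG=-25, VG=-52, VR=102), shifted down by 6.
static void YUVToARGBPixel(uint8_t y, uint8_t u, uint8_t v, uint8_t argb[4]) {
  int y1 = (y - 16) * 74;
  argb[0] = Clamp255((y1 + (u - 128) * 127) >> 6);                    // B
  argb[1] = Clamp255((y1 + (u - 128) * -25 + (v - 128) * -52) >> 6);  // G
  argb[2] = Clamp255((y1 + (v - 128) * 102) >> 6);                    // R
  argb[3] = 255;                                                      // A
}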
+
+// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444.
+#define READYUV444 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 2 UV from 411, upsample to 8 UV.
+#define READYUV411 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 2] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
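The READ* macros above differ only in how many chroma samples they fetch and how far they fan them out: 444 reads one U/V pair per pixel, 422 duplicates each pair across two pixels (punpcklwd), 411 across four (punpcklwd + punpckldq), and NV12 reads already interleaved UV pairs. A small sketch of the 422 case, for illustration only (helper name not part of libyuv):

#include <cstdint>

// Upsample 4 U and 4 V samples to 8 interleaved UV pairs (4:2:2 -> 4:4:4),
// mirroring what READYUV422 does with punpcklbw + punpcklwd.
static void UpsampleUV422(const uint8_t u[4], const uint8_t v[4], uint8_t uv[16]) {
  for (int i = 0; i < 4; ++i) {
    uv[4 * i + 0] = u[i];  // pixel 2*i
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i];  // pixel 2*i + 1 reuses the same chroma
    uv[4 * i + 3] = v[i];
  }
}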
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Convert 8 pixels: 8 VU and 8 Y.
+#define YVUTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// 8 pixels, dest aligned 16.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ mov eax, 0x10001000
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ mov eax, 0x012a012a
+ movd xmm2, eax
+ pshufd xmm2, xmm2, 0
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ align 16
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y.Y
+ psubusw xmm0, xmm3
+ pmulhuw xmm0, xmm2
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4
+ por xmm1, xmm4
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_SSE2
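For reference, a rough scalar sketch of what the routine above computes: each Y sample is expanded with the BT.601 ramp, roughly (y - 16) * 1.164, clamped, and replicated to B, G and R with alpha forced to 255. The helper below is illustrative C, not a libyuv function, and its rounding only approximates the SIMD fixed-point path.

// Scalar sketch: 1.164 ~= 298 / 256, matching the 0x012a multiplier above.
static void YToARGBRow_Sketch(const unsigned char* y_buf,
                              unsigned char* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    int g = ((y_buf[x] - 16) * 298) >> 8;
    if (g < 0) g = 0;
    if (g > 255) g = 255;
    rgb_buf[0] = (unsigned char)g;  // B
    rgb_buf[1] = (unsigned char)g;  // G
    rgb_buf[2] = (unsigned char)g;  // R
    rgb_buf[3] = 255;               // A
    rgb_buf += 4;
  }
}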
+
+#ifdef HAS_MIRRORROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx]
+ pshufb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
+#endif // HAS_MIRRORROW_SSSE3
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
+#ifdef HAS_MIRRORROW_SSE2
+// SSE2 version has movdqu so it can be used on unaligned buffers where the
+// SSSE3 version cannot.
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskRAWToARGB
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax + ecx]
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
- convertloop :
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16]
+ sub edi, edx
+
+ align 16
+ convertloop:
movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ sub ecx, 8
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+
+// Shuffle table for reversing the order of ARGB pixels (4 bytes at a time).
+static const uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kARGBShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx * 4]
+ pshufb xmm0, xmm5
+ sub ecx, 4
movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- ja convertloop
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
+#endif // HAS_ARGBMIRRORROW_SSSE3
-__declspec(naked)
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+#ifdef HAS_SPLITUV_SSE2
+__declspec(naked) __declspec(align(16))
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsRgbY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsRgbY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqa [edx], xmm0
+ movdqa [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_SPLITUV_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte load/store, 32 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 32
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+__declspec(naked) __declspec(align(16))
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep movsd
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_SETROW_X86
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// SetRows32 writes 'width' words of a repeated 32 bit value to each of
+// 'height' rows.
+__declspec(naked) __declspec(align(16))
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ __asm {
+ push esi
+ push edi
+ push ebp
+ mov edi, [esp + 12 + 4] // dst
+ mov eax, [esp + 12 + 8] // v32
+ mov ebp, [esp + 12 + 12] // width
+ mov edx, [esp + 12 + 16] // dst_stride
+ mov esi, [esp + 12 + 20] // height
+ lea ecx, [ebp * 4]
+ sub edx, ecx // stride - width * 4
+
+ align 16
+ convertloop:
+ mov ecx, ebp
+ rep stosd
+ add edi, edx
+ sub esi, 1
+ jg convertloop
+
+ pop ebp
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
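Both blend rows (this SSE2 version and the SSSE3 variant that follows) implement the same per-pixel math: the background is scaled by 256 minus the foreground alpha, added with saturation to the foreground, and the output alpha is forced to 255. A minimal scalar sketch, with an illustrative helper name:

// Scalar sketch of the ARGB "over" blend:
// out = fg + bg * (256 - fg_alpha) / 256, saturated, with out alpha = 255.
static void ARGBBlendRow_Sketch(const unsigned char* src_argb0,  // foreground
                                const unsigned char* src_argb1,  // background
                                unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb0[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = src_argb0[c] + ((src_argb1[c] * (256 - a)) >> 8);
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb[3] = 255;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}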
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3,0F5h // 8 alpha words
+// pshuflw xmm3, xmm3,0F5h
+// with:
+// pshufb xmm3, kShuffleAlpha // alpha
+// Blend 4 pixels at a time.
+
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ test eax, 15 // unaligned?
+ jne convertuloop4
+ test esi, 15 // unaligned?
+ jne convertuloop4
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqa xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqa xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqa xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+ jmp convertloop4b
+
+ // 4 pixel unaligned loop.
+ convertuloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertuloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0,0FFh // 8 alpha words
+ pshuflw xmm2, xmm2,0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1,0FFh // 8 alpha words
+ pshuflw xmm2, xmm2,0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // alphas
+ psrlw xmm0, 8
+ pand xmm2, xmm4
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATE_SSE2
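Attenuation premultiplies each color channel by its pixel's alpha while leaving alpha itself untouched. A scalar sketch of that intent, using the same >>8 style approximation of /255 as the SIMD path; the helper name is illustrative:

// Scalar sketch of alpha attenuation: premultiply B, G, R by alpha.
static void ARGBAttenuateRow_Sketch(const unsigned char* src_argb,
                                    unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[3];
    dst_argb[0] = (unsigned char)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (unsigned char)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (unsigned char)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (unsigned char)a;                         // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}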
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsBgraY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsBgraY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, kShuffleAlpha0
+ movdqa xmm5, kShuffleAlpha1
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqa xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqa xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // mask original alpha
+ pand xmm2, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsAbgrY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsAbgrY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- pushad
- mov edx, [esp + 32 + 4] // Y
- mov edi, [esp + 32 + 8] // U
- mov esi, [esp + 32 + 12] // V
- mov ebp, [esp + 32 + 16] // rgb
- mov ecx, [esp + 32 + 20] // width
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
- lea edx, [edx + 1]
- paddsw mm0, [_kCoefficientsRgbY + 8 * eax]
- psraw mm0, 6
- packuswb mm0, mm0
- movd [ebp], mm0
- lea ebp, [ebp + 4]
- sub ecx, 1
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // Y
- mov edx, [esp + 4 + 8] // rgb
- mov ecx, [esp + 4 + 12] // width
-
- convertloop :
- movzx ebx, byte ptr [eax]
- movq mm0, [_kCoefficientsRgbY + 8 * ebx]
- psraw mm0, 6
- movzx ebx, byte ptr [eax + 1]
- movq mm1, [_kCoefficientsRgbY + 8 * ebx]
- psraw mm1, 6
- packuswb mm0, mm1
- lea eax, [eax + 2]
- movq [edx], mm0
- lea edx, [edx + 8]
- sub ecx, 2
- ja convertloop
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+ sub edx, eax
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
- pop ebx
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqa xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * a
+
+ movdqa xmm2, [eax] // alphas
+ pand xmm2, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm2
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ pop esi
ret
}
}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
-#endif
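Unattenuation is the inverse operation: each color channel is scaled back up by a reciprocal of alpha, which the SIMD path reads from the fixed_invtbl8 table referenced above. The sketch below uses a plain integer divide instead of that fixed-point table, so its rounding differs slightly; the helper name is illustrative:

// Scalar sketch of unattenuation: undo the premultiply, clamping to 255.
static void ARGBUnattenuateRow_Sketch(const unsigned char* src_argb,
                                      unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = a ? (src_argb[c] * 255) / a : src_argb[c];
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (unsigned char)a;
    src_argb += 4;
    dst_argb += 4;
  }
}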
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
+static const vec8 kARGBToGray = {
+ 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToGray
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqa xmm2, [eax] // A
+ movdqa xmm3, [eax + 16]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ sub ecx, 8
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
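A scalar sketch of the gray conversion, applying the same 14/76/38 weights (out of 128) to B, G and R and preserving alpha; the helper name is illustrative:

// Scalar sketch: gray = (B * 14 + G * 76 + R * 38) >> 7, replicated to B, G, R.
static void ARGBGrayRow_Sketch(const unsigned char* src_argb,
                               unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int gray = (src_argb[0] * 14 + src_argb[1] * 76 + src_argb[2] * 38) >> 7;
    dst_argb[0] = (unsigned char)gray;  // B
    dst_argb[1] = (unsigned char)gray;  // G
    dst_argb[2] = (unsigned char)gray;  // R
    dst_argb[3] = src_argb[3];          // A preserved
    src_argb += 4;
    dst_argb += 4;
  }
}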
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
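A scalar sketch of the in-place sepia transform using the coefficients above; packuswb provides the clamp in the SIMD path, so the sketch clamps explicitly. The helper name is illustrative:

// Scalar sketch of the sepia tone, applied in place; alpha is untouched.
static void ARGBSepiaRow_Sketch(unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (unsigned char)(sb > 255 ? 255 : sb);
    dst_argb[1] = (unsigned char)(sg > 255 ? 255 : sg);
    dst_argb[2] = (unsigned char)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}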
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov edx, [esp + 8] /* matrix_argb */
+ mov ecx, [esp + 12] /* width */
+ movd xmm2, [edx]
+ movd xmm3, [edx + 4]
+ movd xmm4, [edx + 8]
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pshufd xmm4, xmm4, 0
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm6 // B
+ phaddsw xmm5, xmm1 // G
+ psraw xmm0, 7 // B
+ psraw xmm5, 7 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddsw xmm5, xmm1
+ psraw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklbw xmm5, xmm6 // 8 RA values
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] /* dst_argb */
+ mov edi, [esp + 16 + 8] /* table_argb */
+ mov ecx, [esp + 16 + 12] /* width */
+ xor ebx, ebx
+ xor edx, edx
+
+ align 16
+ convertloop:
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ and ebp, 255
+ shr esi, 8
+ and esi, 255
+ mov bl, [edi + ebp * 4 + 0] // B
+ mov dl, [edi + esi * 4 + 1] // G
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ shr ebp, 16
+ shr esi, 24
+ and ebp, 255
+ mov [eax], bl
+ mov [eax + 1], dl
+ mov bl, [edi + ebp * 4 + 2] // R
+ mov dl, [edi + esi * 4 + 3] // A
+ mov [eax + 2], bl
+ mov [eax + 3], dl
+ lea eax, [eax + 4]
+ sub ecx, 1
+ jg convertloop
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
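The table is laid out as 256 entries of 4 bytes, and each channel indexes it independently, picking up that entry's matching channel. A scalar sketch with an illustrative helper name:

// Scalar sketch of the in-place color table lookup.
static void ARGBColorTableRow_Sketch(unsigned char* dst_argb,
                                     const unsigned char* table_argb,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}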
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h
+ pshufd xmm2, xmm2, 044h
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqa xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_offset
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm7
+ sub ecx, 4
+ movdqa [eax], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
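Quantization maps each color channel to v = (v * scale >> 16) * interval_size + interval_offset while keeping alpha. A scalar sketch of that intent (the SIMD path's alpha handling differs in detail); the helper name is illustrative:

// Scalar sketch of in-place quantization of B, G, R; alpha is preserved.
static void ARGBQuantizeRow_Sketch(unsigned char* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb += 4;
  }
}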
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at a time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is the offset from left to right of the area in the CumulativeSum
+// buffer, measured in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm4, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm4, xmm4
+ rcpss xmm4, xmm4 // 1.0f / area
+ pshufd xmm4, xmm4, 0
+ sub ecx, 4
+ jl l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movdqa xmm0, [eax]
+ psubd xmm0, [eax + edx * 4]
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]
+ paddd xmm0, [esi + edx * 4]
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
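The averaging above is the classic summed-area-table box filter: for each output pixel, sum = top-left - top-right - bottom-left + bottom-right per channel, then divide by the area. A scalar sketch using integer division where the SIMD path uses a float reciprocal; the helper name is illustrative and assumes 32 bit int:

// Scalar sketch of box averaging from a cumulative-sum buffer (4 ints/pixel).
static void CumulativeSumToAverage_Sketch(const int* topleft,
                                          const int* botleft,
                                          int width, int area,
                                          unsigned char* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int sum = topleft[i * 4 + c] - topleft[i * 4 + width + c] -
                botleft[i * 4 + c] + botleft[i * 4 + width + c];
      dst[i * 4 + c] = (unsigned char)(sum / area);
    }
  }
}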
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ sub esi, edx
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ sub ecx, 4
+ jl l4b
+ test edx, 15
+ jne l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1
+ punpckhwd xmm3, xmm1
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+
+ paddd xmm0, xmm2
+ movdqa xmm2, [edx + esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqa xmm3, [edx + esi + 16]
+ paddd xmm3, xmm0
+ paddd xmm0, xmm4
+ movdqa xmm4, [edx + esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqa xmm5, [edx + esi + 48]
+ paddd xmm5, xmm0
+
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ movdqa [edx + 32], xmm4
+ movdqa [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [edx + esi]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
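A scalar sketch of the cumulative-sum row: a running per-channel sum of the current row is added to the previous row's cumulative sums, so each output value covers everything above and to the left, inclusive. The helper name is illustrative and assumes 32 bit int:

// Scalar sketch: cumsum[x] = previous_cumsum[x] + sum(row[0..x]) per channel.
static void ComputeCumulativeSumRow_Sketch(const unsigned char* row,
                                           int* cumsum,
                                           const int* previous_cumsum,
                                           int width) {
  int sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}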
+
+#ifdef HAS_ARGBSHADE_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ sub edx, eax
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADE_SSE2
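Shading scales every channel, alpha included, by the matching byte of 'value', roughly c * v / 255. A scalar sketch with an illustrative helper name; the byte layout of 'value' here assumes the little-endian B, G, R, A order used above:

// Scalar sketch of the shade operation: per-channel multiply by 'value'.
static void ARGBShadeRow_Sketch(const unsigned char* src_argb,
                                unsigned char* dst_argb, int width,
                                unsigned int value) {
  unsigned char v[4];
  v[0] = (unsigned char)(value & 0xff);          // B scale
  v[1] = (unsigned char)((value >> 8) & 0xff);   // G scale
  v[2] = (unsigned char)((value >> 16) & 0xff);  // R scale
  v[3] = (unsigned char)((value >> 24) & 0xff);  // A scale
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[c] = (unsigned char)((src_argb[c] * v[c]) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}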
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+__declspec(naked) __declspec(align(16))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ align 4
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ sub ecx, 4
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
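The affine row walks a source coordinate (u, v) with a constant per-pixel step (du, dv), truncates to integers and copies one ARGB pixel per destination pixel. A scalar sketch with an illustrative helper name:

// Scalar sketch of the affine row copy: offset = (int)u * 4 + (int)v * stride.
static void ARGBAffineRow_Sketch(const unsigned char* src_argb,
                                 int src_argb_stride, unsigned char* dst_argb,
                                 const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int x = 0; x < width; ++x) {
    const unsigned char* p = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[0] = p[0];
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}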
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
+__declspec(naked) __declspec(align(16))
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ cmp eax, 0
+ je xloop1
+ cmp eax, 64
+ je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ pop edi
+ pop esi
+ ret
+ }
+}
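The bilinear filter blends each byte of a row with the byte one stride below it, weighted by source_y_fraction; the SSSE3 code halves the fraction to 0..127 and special-cases 0 (copy) and 64 (average). A scalar sketch with an illustrative helper name:

// Scalar sketch: dst = (src * (128 - f) + src_below * f) >> 7 per byte.
static void ARGBInterpolateRow_Sketch(unsigned char* dst_ptr,
                                      const unsigned char* src_ptr,
                                      int src_stride, int dst_width,
                                      int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127, as in the SSSE3 path
  int bytes = dst_width * 4;       // 4 bytes per ARGB pixel
  if (f == 0) {
    for (int i = 0; i < bytes; ++i) dst_ptr[i] = src_ptr[i];
    return;
  }
  for (int i = 0; i < bytes; ++i) {
    dst_ptr[i] = (unsigned char)(
        (src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7);
  }
}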
+
+#endif // _M_IX86
+
+#ifdef __cplusplus
} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale.cc b/files/source/scale.cc
index d3b7d333..38910c91 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -12,34 +12,37 @@
#include <assert.h>
#include <string.h>
+#include <stdlib.h> // For getenv()
#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
-#if defined(_MSC_VER)
-#define ALIGN16(var) __declspec(align(16)) var
-#else
-#define ALIGN16(var) var __attribute__((aligned(16)))
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
#endif
-// Note: A Neon reference manual
-// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
+// Bilinear SSE2 is disabled.
+#define SSE2_DISABLED 1
+
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-namespace libyuv {
-
// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.
-
static bool use_reference_impl_ = false;
+LIBYUV_API
void SetUseReferenceImpl(bool use) {
use_reference_impl_ = use;
}
+// ScaleRowDown2Int also used by planar functions
+
/**
* NEON downscalers with interpolation.
*
@@ -47,126 +50,53 @@ void SetUseReferenceImpl(bool use) {
*
*/
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
- uint8* dst, int dst_width) {
- __asm__ volatile
- (
- "1:\n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
- "vst1.u8 {q0}, [%1]! \n" // store even pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- __asm__ volatile
- (
- "mov r4, #2 \n" // rounding constant
- "add %1, %0 \n" // change the stride to row 2 pointer
- "vdup.16 q4, r4 \n"
- "1:\n"
- "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
- "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
- "vpadal.u8 q1, q3 \n"
- "vadd.u16 q0, q4 \n" // rounding
- "vadd.u16 q1, q4 \n"
- "vshrn.u16 d0, q0, #2 \n" // downshift and pack
- "vshrn.u16 d1, q1, #2 \n"
- "vst1.u8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List
- );
-}
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN4_NEON
-// Expecting widths on arm devices to be smaller. Went with 8x4 blocks
-// to get most coverage. Look to back and evaluate 16x4 blocks with
-// handling of leftovers.
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- __asm__ volatile
- (
- "mov r4, #4 \n"
- "1: \n"
- "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to
- "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block
-
- "vst1.u16 {d0[0]}, [%1]! \n"
-
- "subs %2, #2 \n" // dst_width -= 2
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "r4", "q0", "q1", "memory", "cc"
- );
-}
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
-static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm__ volatile
- (
- "1: \n"
- "mov r4, %0 \n"
- "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data
- "vld1.u8 {d1}, [r4],%3 \n"
- "vld1.u8 {d2}, [r4],%3 \n"
- "vld1.u8 {d3}, [r4] \n"
-
- // data is loaded up int q0 and q1
- // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
- // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a20 a21 a22 a23 b20 b21 b22 b23
- // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
- "vpaddl.u8 q0, q0 \n"
-
- // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
- // d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23
- "vpadal.u8 q0, q1 \n"
-
- // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
- // d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23
- "vpaddl.u16 q0, q0 \n"
-
-
- // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23
- // b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23
- "vadd.u32 d0, d1 \n"
-
- "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding
-
- "vst1.u8 {d0[0]}, [%1]! \n"
- "vst1.u8 {d0[4]}, [%1]! \n"
-
- "add %0, #8 \n" // move src pointer to next 8 pixels
- "subs %2, #2 \n" // dst_width -= 2
- "bhi 1b \n"
-
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "q0", "q1", "memory", "cc"
- );
-}
+#define HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load up every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#define HAS_SCALEROWDOWN38_NEON
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 16x2 -> 16x1
+#define HAS_SCALEFILTERROWS_NEON
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction);
/**
* SSE2 downscalers with interpolation.
@@ -175,137 +105,141 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
*
*/
-// Constants for SSE2 code
-#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif defined(OSX)
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+
+// Constants for SSSE3 code
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
+
+// GCC 4.2 on OSX has link error when passing static or const to inline.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
#else
-#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
+#define CONST static const
#endif
// Offsets for source bytes 0 to 9
-extern "C" TALIGN16(const uint8, shuf0[16]) =
+CONST uvec8 kShuf0 =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-extern "C" TALIGN16(const uint8, shuf1[16]) =
+CONST uvec8 kShuf1 =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-extern "C" TALIGN16(const uint8, shuf2[16]) =
+CONST uvec8 kShuf2 =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10
-extern "C" TALIGN16(const uint8, shuf01[16]) =
+CONST uvec8 kShuf01 =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-extern "C" TALIGN16(const uint8, shuf11[16]) =
+CONST uvec8 kShuf11 =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-extern "C" TALIGN16(const uint8, shuf21[16]) =
+CONST uvec8 kShuf21 =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10
-extern "C" TALIGN16(const uint8, madd01[16]) =
+CONST uvec8 kMadd01 =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21
-extern "C" TALIGN16(const uint8, madd11[16]) =
+CONST uvec8 kMadd11 =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31
-extern "C" TALIGN16(const uint8, madd21[16]) =
+CONST uvec8 kMadd21 =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Coefficients for source bytes 21 to 31
-extern "C" TALIGN16(const int16, round34[8]) =
+CONST vec16 kRound34 =
{ 2, 2, 2, 2, 2, 2, 2, 2 };
-extern "C" TALIGN16(const uint8, shuf38a[16]) =
+CONST uvec8 kShuf38a =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-extern "C" TALIGN16(const uint8, shuf38b[16]) =
+CONST uvec8 kShuf38b =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 0,1,2
-extern "C" TALIGN16(const uint8, shufac0[16]) =
+CONST uvec8 kShufAc =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5
-extern "C" TALIGN16(const uint8, shufac3[16]) =
+CONST uvec8 kShufAc3 =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3
-extern "C" TALIGN16(const uint16, scaleac3[8]) =
+CONST uvec16 kScaleAc33 =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab0[16]) =
+CONST uvec8 kShufAb0 =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab1[16]) =
+CONST uvec8 kShufAb1 =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab2[16]) =
+CONST uvec8 kShufAb2 =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2
-extern "C" TALIGN16(const uint16, scaleab2[8]) =
+CONST uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
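
// The kScaleAc33/kScaleAb2 vectors above hold 16 bit fixed point reciprocals
// so that pmulhuw (an unsigned (a * b) >> 16) can replace division by the box
// size. A minimal sketch of the idea, assuming a 3x3 box sum (the truncated
// constant can land one below the exact quotient):
#include <stdint.h>
static inline uint8_t DivideBy9(uint16_t box_sum) {  // box_sum <= 9 * 255
  return (uint8_t)(((uint32_t)box_sum * (65536 / 9)) >> 16);  // ~ box_sum / 9
}
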
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
+ jg wloop
ret
}
}
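
// A minimal scalar sketch of the point sampling above (illustrative name):
// the 0x00ff mask plus packuswb keeps the even numbered pixel of each pair.
#include <stdint.h>
static void RowDown2Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2];
  }
}
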
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -319,16 +253,91 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
+ pand xmm2, xmm5
+ pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
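
// A minimal scalar sketch of the 32x2 -> 16x1 blend above (illustrative
// name): each output pixel is the rounded average of a 2x2 box. The SSE2
// path reaches this with two pavg stages, which may differ from the single
// rounded sum below by at most one.
#include <stdint.h>
static void RowDown2Box_C(const uint8_t* src, const uint8_t* next_row,
                          uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[x * 2] + src[x * 2 + 1] +
                        next_row[x * 2] + next_row[x * 2 + 1] + 2) >> 2);
  }
}
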
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
sub ecx, 16
- ja wloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
pop esi
ret
@@ -338,63 +347,64 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x000000ff
- psrld xmm7, 24
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
+ psrld xmm5, 24
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
sub ecx, 8
- ja wloop
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
- popad
ret
}
}
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
- lea edx, [ebx + ebx * 2] // src_stride * 3
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea esi, [esi + 32]
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
@@ -416,12 +426,13 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
sub ecx, 8
- ja wloop
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
- popad
+ pop edi
+ pop esi
ret
}
}
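
// The cascaded pavgb above stands in for a true 4 row / 4 column average.
// A minimal sketch of that rounding trick (illustrative helpers): pavg(a, b)
// is (a + b + 1) >> 1, so nesting it stays within one of the exact rounded
// average of four values.
#include <stdint.h>
static inline uint8_t Pavg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}
static inline uint8_t Avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return Pavg(Pavg(a, b), Pavg(c, d));  // ~ (a + b + c + d + 2) >> 2
}
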
@@ -429,64 +440,66 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes
- psrlq xmm7, 56
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
+ psrlq xmm5, 56
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1 // 32->16
packuswb xmm0, xmm0 // 16->8
packuswb xmm0, xmm0 // 8->4
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
sub ecx, 4
- ja wloop
+ movd dword ptr [edx], xmm0
+ lea edx, [edx + 4]
+ jg wloop
- popad
ret
}
}
// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- lea edx, [ebx + ebx * 2] // src_stride * 3
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 12 + 4] // src_ptr
+ mov esi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst_ptr
+ mov ecx, [esp + 12 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
pxor xmm7, xmm7
+ align 16
wloop:
- movdqa xmm0, [esi] // average 8 rows to 1
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
+ movdqa xmm0, [eax] // average 8 rows to 1
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea ebp, [esi + ebx * 4]
- lea esi, [esi + 32]
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea ebp, [eax + esi * 4]
+ lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
@@ -494,15 +507,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
movdqa xmm2, [ebp]
movdqa xmm3, [ebp + 16]
- movdqa xmm4, [ebp + ebx]
- movdqa xmm5, [ebp + ebx + 16]
+ movdqa xmm4, [ebp + esi]
+ movdqa xmm5, [ebp + esi + 16]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
- movdqa xmm4, [ebp + ebx * 2]
- movdqa xmm5, [ebp + ebx * 2 + 16]
- movdqa xmm6, [ebp + edx]
+ movdqa xmm4, [ebp + esi * 2]
+ movdqa xmm5, [ebp + esi * 2 + 16]
+ movdqa xmm6, [ebp + edi]
pavgb xmm4, xmm6
- movdqa xmm6, [ebp + edx + 16]
+ movdqa xmm6, [ebp + edi + 16]
pavgb xmm5, xmm6
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@@ -517,60 +530,61 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
psrlw xmm0, 3
packuswb xmm0, xmm0
packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
sub ecx, 4
- ja wloop
+ movd dword ptr [edx], xmm0
+ lea edx, [edx + 4]
+ jg wloop
- popad
+ pop ebp
+ pop edi
+ pop esi
ret
}
}
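
// The 32x8 -> 4x1 blend above first averages eight rows with pavgb, then
// uses psadbw against a zeroed register as a horizontal add (the sum of
// absolute differences with 0 is just the byte sum) before psrlw 3 divides
// by 8. A minimal sketch of that psadbw trick (illustrative name):
#include <stdint.h>
static inline uint16_t HorizontalSum8(const uint8_t* p) {
  uint16_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += p[i];  // == psadbw p, 0 over one 8 byte lane
  }
  return sum;
}
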
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm3, _shuf0
- movdqa xmm4, _shuf1
- movdqa xmm5, _shuf2
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + 16]
- lea esi, [esi + 32]
- movdqa xmm1, xmm2
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
pshufb xmm0, xmm3
pshufb xmm1, xmm4
pshufb xmm2, xmm5
- movq qword ptr [edi], xmm0
- movq qword ptr [edi + 8], xmm1
- movq qword ptr [edi + 16], xmm2
- lea edi, [edi + 24]
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
sub ecx, 24
- ja wloop
+ jg wloop
- popad
ret
}
}
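
// A minimal scalar sketch of the 4 -> 3 point sampling done by the
// kShuf0/kShuf1/kShuf2 shuffles above (illustrative name): of every four
// source pixels, pixels 0, 1 and 3 are kept.
#include <stdint.h>
static void RowDown34Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];
    dst += 3;
    src += 4;
  }
}
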
// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
@@ -581,86 +595,90 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
-// xmm7 round34
+// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
pavgb xmm0, xmm1
pshufb xmm0, xmm3
pmaddubsw xmm0, xmm6
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, _madd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
sub ecx, 24
- ja wloop
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ jg wloop
- popad
+ pop esi
ret
}
}
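
// A minimal scalar sketch of the filtered 4 -> 3 column pass above
// (illustrative name): after the two source rows are blended with pavgb,
// kMadd01/kMadd11/kMadd21 weight neighbouring columns 3:1, 2:2 and 1:3 with
// a rounding bias of 2 and a shift by 2.
#include <stdint.h>
static void RowDown34FilterCols_C(const uint8_t* blended, uint8_t* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = (uint8_t)((blended[0] * 3 + blended[1] + 2) >> 2);
    dst[1] = (uint8_t)((blended[1] * 2 + blended[2] * 2 + 2) >> 2);
    dst[2] = (uint8_t)((blended[2] + blended[3] * 3 + 2) >> 2);
    dst += 3;
    blended += 4;
  }
}
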
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -668,9 +686,9 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm3
@@ -678,24 +696,24 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, _madd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
sub ecx, 24
- ja wloop
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ jg wloop
- popad
+ pop esi
ret
}
}
@@ -704,202 +722,219 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler
// Scale 32 pixels to 12
-__declspec(naked)
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm5, _shuf38a
- movdqa xmm6, _shuf38b
- pxor xmm7, xmm7
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
+ align 16
xloop:
- movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
- lea esi, [esi + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm6
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
paddusb xmm0, xmm1
- movq qword ptr [edi], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edi + 8], xmm1
- lea edi, [edi + 12]
sub ecx, 12
- ja xloop
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ jg xloop
- popad
ret
}
}
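
// A minimal scalar sketch of the 8 -> 3 point sampling above (illustrative
// name): kShuf38a/kShuf38b keep source pixels 0, 3 and 6 of every eight.
#include <stdint.h>
static void RowDown38Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[3];
    dst[2] = src[6];
    dst += 3;
    src += 8;
  }
}
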
// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufac0
- movdqa xmm5, _shufac3
- movdqa xmm6, _scaleac3
- pxor xmm7, xmm7
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5
+ align 16
xloop:
- movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
- movdqa xmm2, [esi + edx]
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqa xmm6, [eax + esi]
movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- movdqa xmm2, [esi + edx * 2]
- lea esi, [esi + 16]
- movhlps xmm3, xmm2
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
-
- movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqa xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
psrldq xmm0, 2
- paddusw xmm2, xmm0
+ paddusw xmm6, xmm0
psrldq xmm0, 2
- paddusw xmm2, xmm0
- pshufb xmm2, xmm4
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
- movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
psrldq xmm1, 2
- paddusw xmm3, xmm1
+ paddusw xmm7, xmm1
psrldq xmm1, 2
- paddusw xmm3, xmm1
- pshufb xmm3, xmm5
- paddusw xmm2, xmm3
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
- pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
- packuswb xmm2, xmm2
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
- movd [edi], xmm2 // write 6 pixels
- pextrw eax, xmm2, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
sub ecx, 6
- ja xloop
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ jg xloop
- popad
+ pop esi
ret
}
}
// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufab0
- movdqa xmm5, _shufab1
- movdqa xmm6, _shufab2
- movdqa xmm7, _scaleab2
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+ align 16
xloop:
- movdqa xmm2, [esi] // average 2 rows into xmm2
- pavgb xmm2, [esi + edx]
- lea esi, [esi + 16]
-
- movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
+ movdqa xmm0, [eax] // average 2 rows into xmm0
+ pavgb xmm0, [eax + esi]
+ lea eax, [eax + 16]
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
pshufb xmm0, xmm4
- movdqa xmm1, xmm2
- pshufb xmm1, xmm5
- paddusw xmm0, xmm1
- pshufb xmm2, xmm6
- paddusw xmm0, xmm2
+ paddusw xmm1, xmm0
- pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
- packuswb xmm0, xmm0
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
- movd [edi], xmm0 // write 6 pixels
- pextrw eax, xmm0, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
sub ecx, 6
- ja xloop
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ jg xloop
- popad
+ pop esi
ret
}
}
#define HAS_SCALEADDROWS_SSE2
-// Reads 8xN bytes and produces 16 shorts at a time.
-__declspec(naked)
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+// Reads 16xN bytes and produces 16 shorts at a time.
+__declspec(naked) __declspec(align(16))
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- mov ebx, [esp + 32 + 20] // height
- pxor xmm7, xmm7
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov esi, [esp + 16 + 4] // src_ptr
+ mov edx, [esp + 16 + 8] // src_stride
+ mov edi, [esp + 16 + 12] // dst_ptr
+ mov ecx, [esp + 16 + 16] // dst_width
+ mov ebx, [esp + 16 + 20] // height
+ pxor xmm4, xmm4
dec ebx
+ align 16
xloop:
// first row
- movdqa xmm2, [esi]
+ movdqa xmm0, [esi]
lea eax, [esi + edx]
- movhlps xmm3, xmm2
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ lea esi, [esi + 16]
mov ebp, ebx
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ test ebp, ebp
+ je ydone
// sum remaining rows
+ align 16
yloop:
- movdqa xmm0, [eax] // read 16 pixels
+ movdqa xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
- movhlps xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- paddusw xmm2, xmm0 // sum 16 words
- paddusw xmm3, xmm1
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
sub ebp, 1
- ja yloop
-
- movdqa [edi], xmm2
- movdqa [edi + 16], xmm3
+ jg yloop
+ ydone:
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
lea edi, [edi + 32]
- lea esi, [esi + 16]
sub ecx, 16
- ja xloop
+ jg xloop
- popad
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
ret
}
}
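
// A minimal scalar sketch of ScaleAddRows above (illustrative name): each
// output word is the column sum over src_height rows, presumably divided by
// the box area later in the box-filter path. The SSE2 code saturates with
// paddusw; this sketch simply accumulates.
#include <stdint.h>
static void AddRows_C(const uint8_t* src, int src_stride, uint16_t* dst,
                      int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src[x + y * src_stride];
    }
    dst[x] = sum;
  }
}
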
+#ifndef SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-#define HAS_SCALEFILTERROWS_SSE2
-__declspec(naked)
+// Normal formula for bilinear interpolation is:
+// source_y_fraction * row1 + (1 - source_y_fraction) * row0
+// SSE2 version using a single multiply of the difference:
+// source_y_fraction * (row1 - row0) + row0
+#define HAS_SCALEFILTERROWS_SSE2_DISABLED
+__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
+ ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
@@ -909,88 +944,88 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
cmp eax, 0
je xloop1
cmp eax, 128
je xloop2
- movd xmm6, eax // xmm6 = y fraction
- punpcklwd xmm6, xmm6
- pshufd xmm6, xmm6, 0
- neg eax // xmm5 = 256 - y fraction
- add eax, 256
- movd xmm5, eax
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- pxor xmm7, xmm7
+ pxor xmm4, xmm4
+ align 16
xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
movdqa xmm1, xmm0
movdqa xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm2, xmm7
- punpckhbw xmm1, xmm7
- punpckhbw xmm3, xmm7
- pmullw xmm0, xmm5 // scale row 0
- pmullw xmm1, xmm5
- pmullw xmm2, xmm6 // scale row 1
- pmullw xmm3, xmm6
- paddusw xmm0, xmm2 // sum rows
- paddusw xmm1, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop1:
movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop1
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop2:
movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
+ pavgb xmm0, [esi + edx]
sub ecx, 16
- ja xloop2
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
}
}
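
// A minimal scalar sketch of the single multiply bilinear form described in
// the comment above (illustrative name): since f * row1 + (1 - f) * row0 is
// row0 + f * (row1 - row0), only the difference needs to be scaled. Rounding
// differs slightly from the pmulhw path; an arithmetic right shift is assumed.
#include <stdint.h>
static void FilterRowDiff_C(uint8_t* dst, const uint8_t* row0,
                            const uint8_t* row1, int width,
                            int source_y_fraction) {  // 0..255
  for (int x = 0; x < width; ++x) {
    int diff = row1[x] - row0[x];
    dst[x] = (uint8_t)(row0[x] + ((diff * source_y_fraction) >> 8));
  }
}
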
-
+#endif // SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
+ ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
@@ -1000,1491 +1035,996 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
cmp eax, 0
je xloop1
- cmp eax, 128
+ cmp eax, 64
je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
- shr eax, 1
- mov ah,al
- neg al
- add al, 128
- movd xmm7, eax
- punpcklwd xmm7, xmm7
- pshufd xmm7, xmm7, 0
-
+ align 16
xloop:
movdqa xmm0, [esi]
movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
- mov al, [edi - 1]
- mov [edi], al
pop edi
pop esi
ret
+ align 16
xloop1:
movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop1
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop2:
movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
+ pavgb xmm0, [esi + edx]
sub ecx, 16
- ja xloop2
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
-
}
}
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width) {
- __asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
- movdqa xmm1, _round34
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _madd21
-
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax+8] // pixels 8..15
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+8], xmm0
- movdqa xmm0, [eax+16] // pixels 16..23
- lea eax, [eax+32]
- pshufb xmm0, xmm4
- pmaddubsw xmm0, xmm7
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+16], xmm0
- lea edx, [edx+24]
- sub ecx, 24
- ja wloop
- ret
- }
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
}
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movdqa 0x10(%0,%3,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrlw $0x8,%%xmm1\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "pavgw %%xmm2,%%xmm0\n"
- "pavgw %%xmm3,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
}
#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrld $0x18,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
}
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- intptr_t temp = 0;
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
- "lea (%4,%4,2),%3\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa (%0,%4,2),%%xmm2\n"
- "movdqa 0x10(%0,%4,2),%%xmm3\n"
- "movdqa (%0,%3,1),%%xmm4\n"
- "movdqa 0x10(%0,%3,1),%%xmm5\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrlw $0x8,%%xmm1\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "pavgw %%xmm2,%%xmm0\n"
- "pavgw %%xmm3,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "pand %%xmm7,%%xmm2\n"
- "pavgw %%xmm2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
+ intptr_t stridex3 = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea (%4,%4,2),%3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%4,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
- "+r"(temp) // %3
+ "+r"(stridex3) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+ );
}
#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlq $0x38,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlq $0x38,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
-}
-
-#if defined(__i386__)
-extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown8Int_SSE2\n"
-"_ScaleRowDown8Int_SSE2:\n"
-#else
- ".global ScaleRowDown8Int_SSE2\n"
-"ScaleRowDown8Int_SSE2:\n"
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "lea (%ebx,%ebx,2),%edx\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm1\n"
- "movdqa (%esi,%ebx,1),%xmm2\n"
- "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "movdqa (%esi,%ebx,2),%xmm2\n"
- "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
- "movdqa (%esi,%edx,1),%xmm4\n"
- "movdqa 0x10(%esi,%edx,1),%xmm5\n"
- "lea (%esi,%ebx,4),%ebp\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "movdqa 0x0(%ebp),%xmm2\n"
- "movdqa 0x10(%ebp),%xmm3\n"
- "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
- "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
- "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
- "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
- "pavgb %xmm6,%xmm4\n"
- "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
- "pavgb %xmm6,%xmm5\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "psadbw %xmm7,%xmm0\n"
- "psadbw %xmm7,%xmm1\n"
- "pshufd $0xd8,%xmm0,%xmm0\n"
- "pshufd $0x8d,%xmm1,%xmm1\n"
- "por %xmm1,%xmm0\n"
- "psrlw $0x3,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movd %xmm0,(%edi)\n"
- "lea 0x4(%edi),%edi\n"
- "sub $0x4,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-// fpic is used for magiccam plugin
-#if !defined(__PIC__)
-#define HAS_SCALEROWDOWN34_SSSE3
-extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_SSSE3\n"
-"_ScaleRowDown34_SSSE3:\n"
-#else
- ".global ScaleRowDown34_SSSE3\n"
-"ScaleRowDown34_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf0,%xmm3\n"
- "movdqa _shuf1,%xmm4\n"
- "movdqa _shuf2,%xmm5\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm2\n"
- "lea 0x20(%esi),%esi\n"
- "movdqa %xmm2,%xmm1\n"
- "palignr $0x8,%xmm0,%xmm1\n"
- "pshufb %xmm3,%xmm0\n"
- "pshufb %xmm4,%xmm1\n"
- "pshufb %xmm5,%xmm2\n"
- "movq %xmm0,(%edi)\n"
- "movq %xmm1,0x8(%edi)\n"
- "movq %xmm2,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_1_Int_SSSE3\n"
-"_ScaleRowDown34_1_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown34_1_Int_SSSE3\n"
-"ScaleRowDown34_1_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebp\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf01,%xmm2\n"
- "movdqa _shuf11,%xmm3\n"
- "movdqa _shuf21,%xmm4\n"
- "movdqa _madd01,%xmm5\n"
- "movdqa _madd11,%xmm6\n"
- "movdqa _round34,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%ebp),%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm2,%xmm0\n"
- "pmaddubsw %xmm5,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movdqu 0x8(%esi),%xmm0\n"
- "movdqu 0x8(%esi,%ebp),%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm3,%xmm0\n"
- "pmaddubsw %xmm6,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x8(%edi)\n"
- "movdqa 0x10(%esi),%xmm0\n"
- "movdqa 0x10(%esi,%ebp),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa _madd21,%xmm1\n"
- "pmaddubsw %xmm1,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
-
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_0_Int_SSSE3\n"
-"_ScaleRowDown34_0_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown34_0_Int_SSSE3\n"
-"ScaleRowDown34_0_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebp\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf01,%xmm2\n"
- "movdqa _shuf11,%xmm3\n"
- "movdqa _shuf21,%xmm4\n"
- "movdqa _madd01,%xmm5\n"
- "movdqa _madd11,%xmm6\n"
- "movdqa _round34,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%ebp,1),%xmm1\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm2,%xmm0\n"
- "pmaddubsw %xmm5,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movdqu 0x8(%esi),%xmm0\n"
- "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm3,%xmm0\n"
- "pmaddubsw %xmm6,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x8(%edi)\n"
- "movdqa 0x10(%esi),%xmm0\n"
- "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa _madd21,%xmm1\n"
- "pmaddubsw %xmm1,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-#define HAS_SCALEROWDOWN38_SSSE3
-extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_SSSE3\n"
-"_ScaleRowDown38_SSSE3:\n"
-#else
- ".global ScaleRowDown38_SSSE3\n"
-"ScaleRowDown38_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf38a ,%xmm5\n"
- "movdqa _shuf38b ,%xmm6\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pshufb %xmm5,%xmm0\n"
- "pshufb %xmm6,%xmm1\n"
- "paddusb %xmm1,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movhlps %xmm0,%xmm1\n"
- "movd %xmm1,0x8(%edi)\n"
- "lea 0xc(%edi),%edi\n"
- "sub $0xc,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_3_Int_SSSE3\n"
-"_ScaleRowDown38_3_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown38_3_Int_SSSE3\n"
-"ScaleRowDown38_3_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shufac0,%xmm4\n"
- "movdqa _shufac3,%xmm5\n"
- "movdqa _scaleac3,%xmm6\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "movhlps %xmm0,%xmm1\n"
- "movhlps %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm1\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "movdqa (%esi,%edx,2),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movhlps %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "movdqa %xmm0,%xmm2\n"
- "psrldq $0x2,%xmm0\n"
- "paddusw %xmm0,%xmm2\n"
- "psrldq $0x2,%xmm0\n"
- "paddusw %xmm0,%xmm2\n"
- "pshufb %xmm4,%xmm2\n"
- "movdqa %xmm1,%xmm3\n"
- "psrldq $0x2,%xmm1\n"
- "paddusw %xmm1,%xmm3\n"
- "psrldq $0x2,%xmm1\n"
- "paddusw %xmm1,%xmm3\n"
- "pshufb %xmm5,%xmm3\n"
- "paddusw %xmm3,%xmm2\n"
- "pmulhuw %xmm6,%xmm2\n"
- "packuswb %xmm2,%xmm2\n"
- "movd %xmm2,(%edi)\n"
- "pextrw $0x2,%xmm2,%eax\n"
- "mov %ax,0x4(%edi)\n"
- "lea 0x6(%edi),%edi\n"
- "sub $0x6,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_2_Int_SSSE3\n"
-"_ScaleRowDown38_2_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown38_2_Int_SSSE3\n"
-"ScaleRowDown38_2_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shufab0,%xmm4\n"
- "movdqa _shufab1,%xmm5\n"
- "movdqa _shufab2,%xmm6\n"
- "movdqa _scaleab2,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm2\n"
- "pavgb (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm2,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa %xmm2,%xmm1\n"
- "pshufb %xmm5,%xmm1\n"
- "paddusw %xmm1,%xmm0\n"
- "pshufb %xmm6,%xmm2\n"
- "paddusw %xmm2,%xmm0\n"
- "pmulhuw %xmm7,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movd %xmm0,(%edi)\n"
- "pextrw $0x2,%xmm0,%eax\n"
- "mov %ax,0x4(%edi)\n"
- "lea 0x6(%edi),%edi\n"
- "sub $0x6,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-#endif // __PIC__
-
-#define HAS_SCALEADDROWS_SSE2
-extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleAddRows_SSE2\n"
-"_ScaleAddRows_SSE2:\n"
-#else
- ".global ScaleAddRows_SSE2\n"
-"ScaleAddRows_SSE2:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "mov 0x34(%esp),%ebx\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm2\n"
- "lea (%esi,%edx,1),%eax\n"
- "movhlps %xmm2,%xmm3\n"
- "lea -0x1(%ebx),%ebp\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
-
-"2:"
- "movdqa (%eax),%xmm0\n"
- "lea (%eax,%edx,1),%eax\n"
- "movhlps %xmm0,%xmm1\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm1\n"
- "paddusw %xmm0,%xmm2\n"
- "paddusw %xmm1,%xmm3\n"
- "sub $0x1,%ebp\n"
- "ja 2b\n"
-
- "movdqa %xmm2,(%edi)\n"
- "movdqa %xmm3,0x10(%edi)\n"
- "lea 0x20(%edi),%edi\n"
- "lea 0x10(%esi),%esi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleFilterRows_SSE2\n"
-"_ScaleFilterRows_SSE2:\n"
-#else
- ".global ScaleFilterRows_SSE2\n"
-"ScaleFilterRows_SSE2:\n"
-#endif
- "push %esi\n"
- "push %edi\n"
- "mov 0xc(%esp),%edi\n"
- "mov 0x10(%esp),%esi\n"
- "mov 0x14(%esp),%edx\n"
- "mov 0x18(%esp),%ecx\n"
- "mov 0x1c(%esp),%eax\n"
- "cmp $0x0,%eax\n"
- "je 2f\n"
- "cmp $0x80,%eax\n"
- "je 3f\n"
- "movd %eax,%xmm6\n"
- "punpcklwd %xmm6,%xmm6\n"
- "pshufd $0x0,%xmm6,%xmm6\n"
- "neg %eax\n"
- "add $0x100,%eax\n"
- "movd %eax,%xmm5\n"
- "punpcklwd %xmm5,%xmm5\n"
- "pshufd $0x0,%xmm5,%xmm5\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,%xmm1\n"
- "movdqa %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpckhbw %xmm7,%xmm1\n"
- "punpckhbw %xmm7,%xmm3\n"
- "pmullw %xmm5,%xmm0\n"
- "pmullw %xmm5,%xmm1\n"
- "pmullw %xmm6,%xmm2\n"
- "pmullw %xmm6,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "psrlw $0x8,%xmm0\n"
- "psrlw $0x8,%xmm1\n"
- "packuswb %xmm1,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"2:"
- "movdqa (%esi),%xmm0\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 2b\n"
-
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"3:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "pavgb %xmm2,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 3b\n"
-
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-);
+ );
+}
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleFilterRows_SSSE3\n"
-"_ScaleFilterRows_SSSE3:\n"
-#else
- ".global ScaleFilterRows_SSSE3\n"
-"ScaleFilterRows_SSSE3:\n"
-#endif
- "push %esi\n"
- "push %edi\n"
- "mov 0xc(%esp),%edi\n"
- "mov 0x10(%esp),%esi\n"
- "mov 0x14(%esp),%edx\n"
- "mov 0x18(%esp),%ecx\n"
- "mov 0x1c(%esp),%eax\n"
- "cmp $0x0,%eax\n"
- "je 2f\n"
- "cmp $0x80,%eax\n"
- "je 3f\n"
- "shr %eax\n"
- "mov %al,%ah\n"
- "neg %al\n"
- "add $0x80,%al\n"
- "movd %eax,%xmm7\n"
- "punpcklwd %xmm7,%xmm7\n"
- "pshufd $0x0,%xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,%xmm1\n"
- "punpcklbw %xmm2,%xmm0\n"
- "punpckhbw %xmm2,%xmm1\n"
- "pmaddubsw %xmm7,%xmm0\n"
- "pmaddubsw %xmm7,%xmm1\n"
- "psrlw $0x7,%xmm0\n"
- "psrlw $0x7,%xmm1\n"
- "packuswb %xmm1,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"2:"
- "movdqa (%esi),%xmm0\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 2b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"3:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "pavgb %xmm2,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 3b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-);
-
-#elif defined(__x86_64__)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "lea (%3,%3,2),%%r10\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movdqa 0x10(%0,%3,1),%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa (%0,%3,2),%%xmm2\n"
- "movdqa 0x10(%0,%3,2),%%xmm3\n"
- "movdqa (%0,%%r10,1),%%xmm4\n"
- "movdqa 0x10(%0,%%r10,1),%%xmm5\n"
- "lea (%0,%3,4),%%r11\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa 0x0(%%r11),%%xmm2\n"
- "movdqa 0x10(%%r11),%%xmm3\n"
- "movdqa 0x0(%%r11,%3,1),%%xmm4\n"
- "movdqa 0x10(%%r11,%3,1),%%xmm5\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "movdqa 0x0(%%r11,%3,2),%%xmm4\n"
- "movdqa 0x10(%%r11,%3,2),%%xmm5\n"
- "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n"
- "pavgb %%xmm6,%%xmm4\n"
- "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n"
- "pavgb %%xmm6,%%xmm5\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "psadbw %%xmm7,%%xmm0\n"
- "psadbw %%xmm7,%%xmm1\n"
- "pshufd $0xd8,%%xmm0,%%xmm0\n"
- "pshufd $0x8d,%%xmm1,%%xmm1\n"
- "por %%xmm1,%%xmm0\n"
- "psrlw $0x3,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ intptr_t stridex3 = 0;
+ intptr_t row4 = 0;
+ asm volatile (
+ "lea (%5,%5,2),%3 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%5,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%5,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%5,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%5,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea (%0,%5,4),%4 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa 0x0(%4),%%xmm2 \n"
+ "movdqa 0x10(%4),%%xmm3 \n"
+ "movdqa 0x0(%4,%5,1),%%xmm4 \n"
+ "movdqa 0x10(%4,%5,1),%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "movdqa 0x0(%4,%5,2),%%xmm4 \n"
+ "movdqa 0x10(%4,%5,2),%%xmm5 \n"
+ "movdqa 0x0(%4,%3,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm4 \n"
+ "movdqa 0x10(%4,%3,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psadbw %%xmm7,%%xmm0 \n"
+ "psadbw %%xmm7,%%xmm1 \n"
+ "pshufd $0xd8,%%xmm0,%%xmm0 \n"
+ "pshufd $0x8d,%%xmm1,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "psrlw $0x3,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ "+rm"(dst_width), // %2
+ "+r"(stridex3), // %3
+ "+r"(row4) // %4
+ : "r"(static_cast<intptr_t>(src_stride)) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
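The rewritten ScaleRowDown8Int_SSE2 above approximates an 8x8 box average per output pixel: a tree of pavgb row averages, then psadbw to sum each half-register, then a shift by 3. A scalar sketch of the reduction it approximates (the helper name is hypothetical, and the SIMD path rounds at every pavgb step, so results can differ slightly):

// Straight 8x8 box average per destination pixel (illustration only).
static void ScaleRowDown8IntRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    unsigned sum = 0;
    for (int r = 0; r < 8; ++r)
      for (int c = 0; c < 8; ++c)
        sum += src_ptr[r * src_stride + c];
    dst_ptr[x] = static_cast<uint8>(sum / 64);
    src_ptr += 8;
  }
}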
#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%3),%%xmm3\n"
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm2\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm2,%%xmm1\n"
- "palignr $0x8,%%xmm0,%%xmm1\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pshufb %%xmm4,%%xmm1\n"
- "pshufb %%xmm5,%%xmm2\n"
- "movq %%xmm0,(%1)\n"
- "movq %%xmm1,0x8(%1)\n"
- "movq %%xmm2,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf0), // %3
- "r"(_shuf1), // %4
- "r"(_shuf2) // %5
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-);
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
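ScaleRowDown34_SSSE3 handles the unfiltered 3/4 case with three pshufb masks (kShuf0/1/2, defined earlier in the file) that keep 24 of every 32 source bytes. The masks are not reproduced in this hunk, but the intended sampling pattern, keeping bytes 0, 1 and 3 of every group of 4, can be sketched as follows (hypothetical helper name):

// Point-sample 3 of every 4 pixels: offsets 0, 1 and 3 within each group.
static void ScaleRowDown34Ref(const uint8* src_ptr, uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}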
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm2\n" // _shuf01
- "movdqa (%5),%%xmm3\n" // _shuf11
- "movdqa (%6),%%xmm4\n" // _shuf21
- "movdqa (%7),%%xmm5\n" // _madd01
- "movdqa (%8),%%xmm6\n" // _madd11
- "movdqa (%9),%%xmm7\n" // _round34
- "movdqa (%10),%%xmm8\n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3),%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm2,%%xmm0\n"
- "pmaddubsw %%xmm5,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqu 0x8(%0),%%xmm0\n"
- "movdqu 0x8(%0,%3),%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x8(%1)\n"
- "movdqa 0x10(%0),%%xmm0\n"
- "movdqa 0x10(%0,%3),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "pmaddubsw %%xmm8,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
-);
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm2\n" // _shuf01
- "movdqa (%5),%%xmm3\n" // _shuf11
- "movdqa (%6),%%xmm4\n" // _shuf21
- "movdqa (%7),%%xmm5\n" // _madd01
- "movdqa (%8),%%xmm6\n" // _madd11
- "movdqa (%9),%%xmm7\n" // _round34
- "movdqa (%10),%%xmm8\n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3,1),%%xmm1\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm2,%%xmm0\n"
- "pmaddubsw %%xmm5,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqu 0x8(%0),%%xmm0\n"
- "movdqu 0x8(%0,%3,1),%%xmm1\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x8(%1)\n"
- "movdqa 0x10(%0),%%xmm0\n"
- "movdqa 0x10(%0,%3,1),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "pmaddubsw %%xmm8,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
-);
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%3),%%xmm5\n"
- "movdqa (%4),%%xmm6\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pshufb %%xmm5,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "paddusb %%xmm1,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movhlps %%xmm0,%%xmm1\n"
- "movd %%xmm1,0x8(%1)\n"
- "lea 0xc(%1),%1\n"
- "sub $0xc,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf38a), // %3
- "r"(_shuf38b) // %4
- : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
-);
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+ );
}
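ScaleRowDown38_SSSE3 point-samples 3 of every 8 source pixels: kShuf38a and kShuf38b (defined elsewhere in the file) each pull three bytes out of one 16-byte load, and paddusb merges them into 12 outputs per 32 inputs. Assuming the usual 3/8 sampling offsets of 0, 3 and 6, a scalar sketch looks like this (hypothetical helper name):

// Point-sample 3 of every 8 pixels: offsets 0, 3 and 6 within each group.
static void ScaleRowDown38Ref(const uint8* src_ptr, uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[3];
    dst[2] = src_ptr[6];
    dst += 3;
    src_ptr += 8;
  }
}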
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
- "movdqa (%6),%%xmm6\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movhlps %%xmm0,%%xmm1\n"
- "movhlps %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm1\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "movdqa (%0,%3,2),%%xmm2\n"
- "lea 0x10(%0),%0\n"
- "movhlps %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrldq $0x2,%%xmm0\n"
- "paddusw %%xmm0,%%xmm2\n"
- "psrldq $0x2,%%xmm0\n"
- "paddusw %%xmm0,%%xmm2\n"
- "pshufb %%xmm4,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrldq $0x2,%%xmm1\n"
- "paddusw %%xmm1,%%xmm3\n"
- "psrldq $0x2,%%xmm1\n"
- "paddusw %%xmm1,%%xmm3\n"
- "pshufb %%xmm5,%%xmm3\n"
- "paddusw %%xmm3,%%xmm2\n"
- "pmulhuw %%xmm6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movd %%xmm2,(%1)\n"
- "pextrw $0x2,%%xmm2,%%eax\n"
- "mov %%ax,0x4(%1)\n"
- "lea 0x6(%1),%1\n"
- "sub $0x6,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3,1),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shufac0), // %4
- "r"(_shufac3), // %5
- "r"(_scaleac3) // %6
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
}
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
- "movdqa (%6),%%xmm6\n"
- "movdqa (%7),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm2\n"
- "pavgb (%0,%3,1),%%xmm2\n"
- "lea 0x10(%0),%0\n"
- "movdqa %%xmm2,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "movdqa %%xmm2,%%xmm1\n"
- "pshufb %%xmm5,%%xmm1\n"
- "paddusw %%xmm1,%%xmm0\n"
- "pshufb %%xmm6,%%xmm2\n"
- "paddusw %%xmm2,%%xmm0\n"
- "pmulhuw %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "pextrw $0x2,%%xmm0,%%eax\n"
- "mov %%ax,0x4(%1)\n"
- "lea 0x6(%1),%1\n"
- "sub $0x6,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shufab0), // %4
- "r"(_shufab1), // %5
- "r"(_shufab2), // %6
- "r"(_scaleab2) // %7
- : "memory", "rax", "xmm0", "xmm1", "xmm2",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa (%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- asm volatile(
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm2\n"
- "lea (%0,%4,1),%%r10\n"
- "movhlps %%xmm2,%%xmm3\n"
- "lea -0x1(%3),%%r11\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
-
-"2:"
- "movdqa (%%r10),%%xmm0\n"
- "lea (%%r10,%4,1),%%r10\n"
- "movhlps %%xmm0,%%xmm1\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm1\n"
- "paddusw %%xmm0,%%xmm2\n"
- "paddusw %%xmm1,%%xmm3\n"
- "sub $0x1,%%r11\n"
- "ja 2b\n"
-
- "movdqa %%xmm2,(%1)\n"
- "movdqa %%xmm3,0x10(%1)\n"
- "lea 0x20(%1),%1\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
+ asm volatile (
+ "pxor %%xmm4,%%xmm4 \n"
+ "sub $0x1,%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "mov %0,%3 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "mov %5,%2 \n"
+ "test %2,%2 \n"
+ "je 3f \n"
+ "2: \n"
+ "movdqa (%0),%%xmm2 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 2b \n"
+ "3: \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x10(%3),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(src_width), // %2
- "+r"(src_height) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
-);
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"(static_cast<intptr_t>(src_stride)) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
}
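ScaleAddRows_SSE2 now walks the source 16 columns at a time, widening each byte to 16 bits and accumulating src_height rows per column into dst_ptr; the box scaler divides these column sums later. A scalar sketch of the same accumulation (saturation from paddusw is ignored, and the helper name is hypothetical):

static void ScaleAddRowsRef(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint16* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];               // sum one column over src_height rows
      s += src_stride;
    }
    dst_ptr[x] = static_cast<uint16>(sum);
  }
}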
+#ifndef SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
+#define HAS_SCALEFILTERROWS_SSE2_DISABLED
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
- if (source_y_fraction == 0) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "rax", "xmm0"
- );
- return;
- } else if (source_y_fraction == 128) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%3,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "pavgb %%xmm2,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
- );
- return;
- } else {
- asm volatile(
- "mov %3,%%eax\n"
- "movd %%eax,%%xmm6\n"
- "punpcklwd %%xmm6,%%xmm6\n"
- "pshufd $0x0,%%xmm6,%%xmm6\n"
- "neg %%eax\n"
- "add $0x100,%%eax\n"
- "movd %%eax,%%xmm5\n"
- "punpcklwd %%xmm5,%%xmm5\n"
- "pshufd $0x0,%%xmm5,%%xmm5\n"
- "pxor %%xmm7,%%xmm7\n"
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%4,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,%%xmm1\n"
- "movdqa %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpckhbw %%xmm7,%%xmm1\n"
- "punpckhbw %%xmm7,%%xmm3\n"
- "pmullw %%xmm5,%%xmm0\n"
- "pmullw %%xmm5,%%xmm1\n"
- "pmullw %%xmm6,%%xmm2\n"
- "pmullw %%xmm6,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm5", "xmm6", "xmm7"
- );
- }
- return;
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x80,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm0 \n"
+ "punpckhqdq %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+#endif // SSE2_DISABLED
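The SSE2 filter above (kept behind SSE2_DISABLED and no longer selected, since its HAS_ macro is renamed to HAS_SCALEFILTERROWS_SSE2_DISABLED) now blends the two rows as a + (b - a) * f rather than (256 - f) * a + f * b, using psubw/pmulhw/paddw with fast paths for f == 0 (copy) and f == 128 (pavgb); the epilogue replicates the last pixel, which the column filter reads one past the end of the row. Per pixel that is approximately the following (sketch; pmulhw scales the replicated fraction slightly differently):

// f is source_y_fraction in [0, 256); a is from the upper row, b from the
// row src_stride below. Approximate scalar form of the SSE2 blend.
static inline uint8 FilterRowPixelSSE2(uint8 a, uint8 b, int f) {
  return static_cast<uint8>(a + (((b - a) * f) >> 8));
}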
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
- if (source_y_fraction == 0) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "rax", "xmm0"
- );
- return;
- } else if (source_y_fraction == 128) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%3,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "pavgb %%xmm2,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
- );
- return;
- } else {
- asm volatile(
- "mov %3,%%eax\n"
- "shr %%eax\n"
- "mov %%al,%%ah\n"
- "neg %%al\n"
- "add $0x80,%%al\n"
- "movd %%eax,%%xmm7\n"
- "punpcklwd %%xmm7,%%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%4,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,%%xmm1\n"
- "punpcklbw %%xmm2,%%xmm0\n"
- "punpckhbw %%xmm2,%%xmm1\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
- "psrlw $0x7,%%xmm0\n"
- "psrlw $0x7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
- );
- }
- return;
-}
-#endif
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm0 \n"
+ "punpckhqdq %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
+ );
+}
+#endif // defined(__x86_64__) || defined(__i386__)
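The SSSE3 version reaches the same result with a single pmaddubsw: the fraction is halved, the byte pair (128 - f/2, f/2) is broadcast into xmm5, the two rows are interleaved with punpcklbw/punpckhbw, and each 16-bit lane of the multiply-add is shifted right by 7. A scalar rendering of that weighting (sketch only; per-pixel rounding can differ from the C path):

// w1 = f / 2 and w0 = 128 - w1 both fit in a byte, as pmaddubsw requires.
static inline uint8 FilterRowPixelSSSE3(uint8 a, uint8 b, int f) {
  int w1 = f >> 1;
  int w0 = 128 - w1;
  return static_cast<uint8>((a * w0 + b * w1) >> 7);
}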
// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, int,
+static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 2;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[2];
+ dst += 2;
+ src_ptr += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
-static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] +
- src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
- src_ptr += 2;
+void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
-static void ScaleRowDown4_C(const uint8* src_ptr, int,
+static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 4;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[4];
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
-static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
- src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
- src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
- src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
- 8) >> 4;
- src_ptr += 4;
+ intptr_t stride = src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
}
}
@@ -2493,19 +2033,25 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
static const int kMaxOutputWidth = 640;
static const int kMaxRow12 = kMaxOutputWidth * 2;
-static void ScaleRowDown8_C(const uint8* src_ptr, int,
+static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 8;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[8];
+ dst += 2;
+ src_ptr += 16;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
// Note calling code checks width is less than max and if not
// uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
- ALIGN16(uint8 src_row[kMaxRow12 * 2]);
+ SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
assert(dst_width <= kMaxOutputWidth);
ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
@@ -2514,7 +2060,7 @@ static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
}
-static void ScaleRowDown34_C(const uint8* src_ptr, int,
+static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = dst + dst_width;
@@ -2528,12 +2074,12 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
}
// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2551,12 +2097,12 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2573,13 +2119,42 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
} while (d < dend);
}
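The two C kernels above differ only in how the pair of source rows is mixed before the 3/4 horizontal filter: ScaleRowDown34_0_Int_C weights the nearer row 3:1, while ScaleRowDown34_1_Int_C averages the rows 1:1. A sketch of the per-pixel row mix (illustrative helpers; the a0/a1/a2 column weights are as shown above):

static inline uint8 Mix31(uint8 near_px, uint8 far_px) {   // rows 0 and 1, 3:1
  return static_cast<uint8>((near_px * 3 + far_px + 2) >> 2);
}
static inline uint8 Mix11(uint8 a, uint8 b) {              // rows 1 and 2, 1:1
  return static_cast<uint8>((a + b + 1) >> 1);
}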
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
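The BLENDER macro above relies on the identity (1 - f) * a + f * b == a + f * (b - a), which saves a multiply per sample in the 16.16 fixed-point column filter. A quick numeric check of the two forms (illustration only):

// a = 10, b = 250, f = 0.5 in 16.16 fixed point: both forms give 130.
int a = 10, b = 250, f = 0x8000;
int direct  = static_cast<int>(((65536LL - f) * a + 1LL * f * b) >> 16);
int blended = a + ((f * (b - a)) >> 16);
// direct == 130 && blended == 130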
+
+static const int kMaxInputWidth = 2560;
+
#if defined(HAS_SCALEFILTERROWS_SSE2)
// Filter row to 3/4
static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = dst_ptr + dst_width;
const uint8* s = src_ptr;
+ uint8* dend = dst_ptr + dst_width;
do {
dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2588,45 +2163,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
s += 4;
} while (dst_ptr < dend);
}
-#endif
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx) {
- int x = 0;
- for (int j = 0; j < dst_width; ++j) {
- int xi = x >> 16;
- int xf1 = x & 0xffff;
- int xf0 = 65536 - xf1;
-
- *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
- x += dx;
- }
-}
-static const int kMaxInputWidth = 2560;
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-#define HAS_SCALEROWDOWN34_SSE2
+#define HAS_SCALEROWDOWN34_SSE2_DISABLED
// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- ALIGN16(uint8 row[kMaxInputWidth]);
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
- 256 / 4);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
+ ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
ScaleFilterCols34_C(dst_ptr, row, dst_width);
}
// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- ALIGN16(uint8 row[kMaxInputWidth]);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
ScaleFilterCols34_C(dst_ptr, row, dst_width);
}
#endif
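The SSE2 3/4 wrappers above (now compiled out of the dispatch via HAS_SCALEROWDOWN34_SSE2_DISABLED) reuse ScaleFilterRows_SSE2 as the row mixer: a y fraction of 256/4 = 64 approximately reproduces the 3:1 row blend, and 256/2 = 128 hits the pavgb path, the 1:1 blend. Ignoring the pmulhw scaling detail:

// f = 64:  a + ((b - a) * 64  >> 8) == (3 * a + b) >> 2   (3:1 blend)
// f = 128: a + ((b - a) * 128 >> 8) == (a + b) >> 1       (1:1 blend)
// e.g. a = 100, b = 20: 100 + ((20 - 100) * 64 >> 8) == 80 == (3 * 100 + 20) / 4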
-static void ScaleRowDown38_C(const uint8* src_ptr, int,
+static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
assert(dst_width % 3 == 0);
for (int x = 0; x < dst_width; x += 3) {
@@ -2639,23 +2199,25 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
}
// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- for (int i = 0; i < dst_width; i+=3) {
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
- src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
- src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
@@ -2663,18 +2225,19 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
}
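The 8x3 -> 3x1 kernel above divides its 9- and 6-pixel box sums with fixed-point reciprocals, sum * (65536 / 9) >> 16 and sum * (65536 / 6) >> 16, instead of integer division. The reciprocal truncates, so a result can come out one below an exact divide; for byte-sized sums it stays in range. For example:

// 65536 / 9 == 7281, so sum * 7281 >> 16 approximates sum / 9.
static inline uint8 Div9Approx(unsigned sum) {
  return static_cast<uint8>(sum * (65536 / 9) >> 16);
}
// Div9Approx(9 * 255) == 254, one below the exact 255, because the
// truncated reciprocal slightly undershoots 1/9.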
// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- for (int i = 0; i < dst_width; i+=3) {
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
@@ -2683,7 +2246,7 @@ static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
// C version 8x2 -> 8x1
static void ScaleFilterRows_C(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
assert(dst_width > 0);
int y1_fraction = source_y_fraction;
@@ -2706,7 +2269,7 @@ static void ScaleFilterRows_C(uint8* dst_ptr,
dst_ptr[0] = dst_ptr[-1];
}
-void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
assert(src_width > 0);
assert(src_height > 0);
@@ -2728,35 +2291,31 @@ void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
* its original size.
*
*/
-static void ScalePlaneDown2(int src_width, int src_height,
+static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 2 == 0);
- assert(src_height % 2 == 0);
- void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
-
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 16 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(dst_ptr, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
- } else
-#endif
- {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
}
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
+ ScaleRowDown2_Unaligned_SSE2;
+ if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+ }
+ }
+#endif
+ // TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 1);
@@ -2770,34 +2329,26 @@ static void ScalePlaneDown2(int src_width, int src_height,
* This is an optimized version for scaling down a plane to 1/4 of
* its original size.
*/
-static void ScalePlaneDown4(int src_width, int src_height,
+static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 4 == 0);
- assert(src_height % 4 == 0);
- void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
-
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
#if defined(HAS_SCALEROWDOWN4_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 8)) {
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 4)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ }
+#elif defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
- } else
-#endif
- {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
}
+#endif
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
@@ -2813,27 +2364,23 @@ static void ScalePlaneDown4(int src_width, int src_height,
* of its original size.
*
*/
-static void ScalePlaneDown8(int src_width, int src_height,
+static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 8 == 0);
- assert(src_height % 8 == 0);
- void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering && (dst_width <= kMaxOutputWidth) ?
+ ScaleRowDown8Int_C : ScaleRowDown8_C;
#if defined(HAS_SCALEROWDOWN8_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
- (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
- } else
-#endif
- {
- ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
- ScaleRowDown8Int_C : ScaleRowDown8_C;
}
+#endif
+
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 3);
@@ -2847,72 +2394,75 @@ static void ScalePlaneDown8(int src_width, int src_height,
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
-static void ScalePlaneDown34(int src_width, int src_height,
+static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(dst_width % 3 == 0);
- void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
}
- } else
+ }
#endif
#if defined(HAS_SCALEROWDOWN34_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
- filtering) {
+ if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
- } else
+ }
#endif
- {
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_C;
- ScaleRowDown34_1 = ScaleRowDown34_C;
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
}
}
- int src_row = 0;
- for (int y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 1:
- ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 2:
- ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
- dst_ptr, dst_width);
- break;
- }
- ++src_row;
+#endif
+
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- if (src_row >= 3) {
- src_ptr += src_stride;
- src_row = 0;
- }
+ ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+  // Remaining 1 or 2 rows; the last row is vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
}
}
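The rewritten ScalePlaneDown34 loop consumes source rows in groups of four and writes three destination rows per group; the third call walks backwards via -src_stride so rows 3 and 2 are blended by the 3:1 kernel in the right order, and the trailer covers dst_height values not divisible by 3, passing a stride of 0 so the final row is vertically unfiltered. As a sketch of one group (assuming the _0 kernel weights its first row 3:1, as in the C path):

// Per group of 4 source rows s0..s3 -> 3 destination rows:
//   d0 = Filter31(s0, s1)   // 3:1 toward s0
//   d1 = Filter11(s1, s2)   // 1:1
//   d2 = Filter31(s3, s2)   // 3:1 toward s3, via the -src_stride call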
@@ -2922,23 +2472,47 @@ static void ScalePlaneDown34(int src_width, int src_height,
* This is an optimized version for scaling down a plane to 3/8
* of its original size.
*
- * Reduces 16x3 to 6x1
+ * Uses a box filter arranged like this:
+ * aaabbbcc -> abc
+ * aaabbbcc def
+ * aaabbbcc ghi
+ * dddeeeff
+ * dddeeeff
+ * dddeeeff
+ * ggghhhii
+ * ggghhhii
+ * Boxes are 3x3, 2x3, 3x2 and 2x2
*/
-static void ScalePlaneDown38(int src_width, int src_height,
+static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(dst_width % 3 == 0);
- void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-#if defined(HAS_SCALEROWDOWN38_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -2946,39 +2520,34 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
}
- } else
+ }
#endif
- {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_C;
- ScaleRowDown38_2 = ScaleRowDown38_C;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
- }
+
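+  // Each iteration below consumes 3 + 3 + 2 = 8 source rows and produces 3
+  // output rows (the 3/8 vertical ratio); the row functions reduce 8 source
+  // pixels to 3 for the matching horizontal ratio.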
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
}
- int src_row = 0;
- for (int y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- case 1:
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- ++src_row;
- break;
-
- case 2:
- ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- src_row = 0;
- break;
- }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
}
}
-inline static uint32 SumBox(int iboxwidth, int iboxheight,
- int src_stride, const uint8* src_ptr) {
+static __inline uint32 SumBox(int iboxwidth, int iboxheight,
+ ptrdiff_t src_stride, const uint8* src_ptr) {
assert(iboxwidth > 0);
assert(iboxheight > 0);
uint32 sum = 0u;
@@ -2991,10 +2560,9 @@ inline static uint32 SumBox(int iboxwidth, int iboxheight,
return sum;
}
-static void ScalePlaneBoxRow(int dst_width, int boxheight,
- int dx, int src_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int x = 0;
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+ int x, int dx, ptrdiff_t src_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
@@ -3004,7 +2572,7 @@ static void ScalePlaneBoxRow(int dst_width, int boxheight,
}
}
-inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
assert(iboxwidth > 0);
uint32 sum = 0u;
for (int x = 0; x < iboxwidth; ++x) {
@@ -3013,14 +2581,13 @@ inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
return sum;
}
-static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int scaletbl[2];
int minboxwidth = (dx >> 16);
scaletbl[0] = 65536 / (minboxwidth * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
int *scaleptr = scaletbl - minboxwidth;
- int x = 0;
for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
@@ -3029,11 +2596,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
}
}
-static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
- int x = 0;
for (int i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth;
@@ -3055,61 +2621,56 @@ static void ScalePlaneBox(int src_width, int src_height,
const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0);
assert(dst_height > 0);
- int dy = (src_height << 16) / dst_height;
int dx = (src_width << 16) / dst_width;
- if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height << 16);
+ if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
dst_height * 2 > src_height) {
uint8* dst = dst_ptr;
- int dy = (src_height << 16) / dst_height;
- int dx = (src_width << 16) / dst_width;
- int y = 0;
for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
+ const uint8* src = src_ptr + iy * src_stride;
y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
+ if (y > maxy) {
+ y = maxy;
}
int boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow(dst_width, boxheight,
- dx, src_stride,
- src, dst);
-
+ ScalePlaneBoxRow_C(dst_width, boxheight,
+ x, dx, src_stride,
+ src, dst);
dst += dst_stride;
}
} else {
- ALIGN16(uint16 row[kMaxInputWidth]);
- void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width, int src_height);
- void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+ SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
+ void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height)=
+ ScaleAddRows_C;
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr);
-#if defined(HAS_SCALEADDROWS_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
- ScaleAddRows = ScaleAddRows_SSE2;
- } else
-#endif
- {
- ScaleAddRows = ScaleAddRows_C;
- }
if (dx & 0xffff) {
ScaleAddCols = ScaleAddCols2_C;
} else {
ScaleAddCols = ScaleAddCols1_C;
}
+#if defined(HAS_SCALEADDROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleAddRows = ScaleAddRows_SSE2;
+ }
+#endif
- int y = 0;
for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
+ const uint8* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
}
int boxheight = (y >> 16) - iy;
ScaleAddRows(src, src_stride, row, src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+ ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
dst_ptr += dst_stride;
}
}
@@ -3122,33 +2683,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height;
- int maxx = ((src_width - 1) << 16) - 1;
- int maxy = ((src_height - 1) << 16) - 1;
- int y = (dst_height < src_height) ? 32768 :
- (src_height << 16) / dst_height - 32768;
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
for (int i = 0; i < dst_height; ++i) {
- int cy = (y < 0) ? 0 : y;
- int yi = cy >> 16;
- int yf = cy & 0xffff;
- const uint8* const src = src_ptr + yi * src_stride;
- int x = (dst_width < src_width) ? 32768 :
- (src_width << 16) / dst_width - 32768;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int yi = y >> 16;
+ int yf = y & 0xffff;
+ const uint8* src0 = src_ptr + yi * src_stride;
+ const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
+ uint8* dst = dst_ptr;
for (int j = 0; j < dst_width; ++j) {
- int cx = (x < 0) ? 0 : x;
- int xi = cx >> 16;
- int xf = cx & 0xffff;
- int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
- int r1 = (src[xi + src_stride] * (65536 - xf) +
- src[xi + src_stride + 1] * xf) >> 16;
- *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+ int xi = x >> 16;
+ int xf = x & 0xffff;
+ int x1 = (xi < src_width - 1) ? xi + 1 : xi;
+ int a = src0[xi];
+ int b = src0[x1];
+ int r0 = BLENDER(a, b, xf);
+ a = src1[xi];
+ b = src1[x1];
+ int r1 = BLENDER(a, b, xf);
+ *dst++ = BLENDER(r0, r1, yf);
x += dx;
if (x > maxx)
x = maxx;
}
- dst += dst_stride - dst_width;
+ dst_ptr += dst_stride;
y += dy;
if (y > maxy)
y = maxy;
@@ -3159,52 +2721,51 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
* Scale plane to/from any dimensions, with bilinear
* interpolation.
*/
-static void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0);
assert(dst_height > 0);
- int dy = (src_height << 16) / dst_height;
- int dx = (src_width << 16) / dst_width;
- if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
+ if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src_ptr, dst_ptr);
} else {
- ALIGN16(uint8 row[kMaxInputWidth + 1]);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride,
- int dst_width, int source_y_fraction);
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx);
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
- ScaleFilterRows = ScaleFilterRows_SSSE3;
- } else
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) =
+ ScaleFilterRows_C;
+#if defined(HAS_SCALEFILTERROWS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleFilterRows = ScaleFilterRows_NEON;
+ }
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
ScaleFilterRows = ScaleFilterRows_SSE2;
- } else
+ }
#endif
- {
- ScaleFilterRows = ScaleFilterRows_C;
+#if defined(HAS_SCALEFILTERROWS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleFilterRows = ScaleFilterRows_SSSE3;
}
- ScaleFilterCols = ScaleFilterCols_C;
+#endif
- int y = 0;
- int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
for (int j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- int fy = (y >> 8) & 255;
- const uint8* const src = src_ptr + iy * src_stride;
- ScaleFilterRows(row, src, src_stride, src_width, fy);
- ScaleFilterCols(dst_ptr, row, dst_width, dx);
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_ptr + yi * src_stride;
+ ScaleFilterRows(row, src, src_stride, src_width, yf);
+ ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
if (y > maxy) {
@@ -3224,18 +2785,20 @@ static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width;
- for (int y = 0; y < dst_height; ++y) {
- const uint8* const src = src_ptr + (y * src_height / dst_height) *
- src_stride;
- // TODO(fbarchard): Round X coordinate by setting x=0x8000.
- int x = 0;
+ int dy = (src_height << 16) / dst_height;
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ for (int j = 0; j < dst_height; ++j) {
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ uint8* dst = dst_ptr;
for (int i = 0; i < dst_width; ++i) {
*dst++ = src[x >> 16];
x += dx;
}
- dst += dst_stride - dst_width;
+ dst_ptr += dst_stride;
+ y += dy;
}
}
@@ -3283,47 +2846,31 @@ static void ScalePlaneDown(int src_width, int src_height,
}
}
-/**
- * Copy plane, no scaling
- *
- * This simply copies the given plane without scaling.
- * The current implementation is ~115 times faster
- * compared to the reference implementation.
- *
- */
-static void CopyPlane(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- if (src_stride == src_width && dst_stride == dst_width) {
- // All contiguous, so can use REALLY fast path.
- memcpy(dst_ptr, src_ptr, src_width * src_height);
- } else {
- // Not all contiguous; must copy scanlines individually
- const uint8* src = src_ptr;
- uint8* dst = dst_ptr;
- for (int i = 0; i < src_height; ++i) {
- memcpy(dst, src, src_width);
- dst += dst_stride;
- src += src_stride;
- }
+// Scale a plane.
+// This function in turn calls a scaling function suitable for handling
+// the desired resolutions.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+#ifdef CPU_X86
+ // environment variable overrides for testing.
+ char *filter_override = getenv("LIBYUV_FILTER");
+ if (filter_override) {
+ filtering = (FilterMode)atoi(filter_override); // NOLINT
}
-}
-
-static void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- FilterMode filtering, bool use_ref) {
+#endif
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
- CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
} else if (dst_width <= src_width && dst_height <= src_height) {
// Scale down.
- if (use_ref) {
+ if (use_reference_impl_) {
// For testing, allow the optimized versions to be disabled.
ScalePlaneDown(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -3342,11 +2889,13 @@ static void ScalePlane(const uint8* src, int src_stride,
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+ } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+ } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
// optimized, 1/8
ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -3362,14 +2911,12 @@ static void ScalePlane(const uint8* src, int src_stride,
}
}
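+// A minimal usage sketch of ScalePlane, assuming caller-owned buffers with
+// stride == width; the 640x360 -> 320x180 size and the kFilterBox choice are
+// illustrative only:
+//   uint8 src_y[640 * 360];
+//   uint8 dst_y[320 * 180];
+//   ScalePlane(src_y, 640, 640, 360,  // src, src_stride, src_width, src_height
+//              dst_y, 320, 320, 180,  // dst, dst_stride, dst_width, dst_height
+//              kFilterBox);           // 1/2 in both axes uses ScalePlaneDown2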
-/**
- * Scale a plane.
- *
- * This function in turn calls a scaling function
- * suitable for handling the desired resolutions.
- *
- */
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
+
+#define UNDER_ALLOCATED_HACK 1
+LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -3394,23 +2941,47 @@ int I420Scale(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
- dst_u, dst_stride_u, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
- dst_v, dst_stride_v, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
return 0;
}
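+// A minimal usage sketch, assuming contiguous caller-owned I420 buffers
+// (Y plane followed by U and V at quarter size) and the argument order of the
+// I420Scale declaration above; sizes and filter are illustrative:
+//   uint8 src[640 * 360 * 3 / 2];
+//   uint8 dst[320 * 180 * 3 / 2];
+//   I420Scale(src, 640,                      // src Y, stride
+//             src + 640 * 360, 320,          // src U, stride
+//             src + 640 * 360 * 5 / 4, 320,  // src V, stride
+//             640, 360,
+//             dst, 320,
+//             dst + 320 * 180, 160,
+//             dst + 320 * 180 * 5 / 4, 160,
+//             320, 180, kFilterBilinear);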
+// Deprecated API.
+LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
@@ -3433,49 +3004,77 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
+
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
- dst_u, dst_stride_u, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
- dst_v, dst_stride_v, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
return 0;
}
-int Scale(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int ooffset,
- bool interpolate) {
+// Deprecated API.
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate) {
if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
- ooffset >= dst_height) {
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
+ dst_yoffset >= dst_height) {
return -1;
}
- ooffset = ooffset & ~1; // chroma requires offset to multiple of 2.
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
- int aheight = dst_height - ooffset * 2; // actual output height
- const uint8* const iyptr = src;
- uint8* oyptr = dst + ooffset * dst_width;
- const uint8* const iuptr = src + src_width * src_height;
- uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
- const uint8* const ivptr = src + src_width * src_height +
- halfsrc_width * halfsrc_height;
- uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight +
- (ooffset >> 1) * halfdst_width;
- return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
- src_width, src_height, oyptr, ouptr, ovptr, dst_width,
- halfdst_width, halfdst_width, dst_width, aheight, interpolate);
-}
-
+ dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+ int aheight = dst_height - dst_yoffset * 2; // actual output height
+ const uint8* src_y = src;
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset * dst_width;
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
+ src_width, src_height, dst_y, dst_u, dst_v, dst_width,
+ dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+}
+
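+// Worked example of the plane arithmetic above: letterboxing into a 640x480
+// I420 destination with dst_yoffset = 60 scales into the middle
+// aheight = 480 - 2 * 60 = 360 rows, with
+//   dst_y = dst + 60 * 640              = dst + 38400
+//   dst_u = dst + 640 * 480 + 30 * 320  = dst + 316800
+//   dst_v = dst_u + 320 * 240           = dst + 393600
+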
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
new file mode 100644
index 00000000..5d4e1ac0
--- /dev/null
+++ b/files/source/scale_argb.cc
@@ -0,0 +1,1035 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h> // For getenv()
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Bilinear SSE2 is disabled.
+#define SSE2_DISABLED 1
+
+// ARGB scaling uses bilinear or point, but not box filter.
+/**
+ * SSE2 downscalers with bilinear interpolation.
+ */
+
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+
+#define HAS_SCALEARGBROWDOWN2_SSE2
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0x88
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_ptr
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_ptr
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_ptr
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
+#ifndef SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ cmp eax, 0
+ je xloop1
+ cmp eax, 128
+ je xloop2
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+ pxor xmm4, xmm4
+
+ // f * row1 + (1 - frac) row0
+ // frac * (row1 - row0) + row0
+ align 16
+ xloop:
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // SSE2_DISABLED
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
+#define HAS_SCALEARGBFILTERROWS_SSSE3
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ cmp eax, 0
+ je xloop1
+ cmp eax, 64
+ je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEARGBROWDOWN2_SSE2
+static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd (%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd (%0,%1,2),%%xmm2 \n"
+ "movd (%0,%4,1),%%xmm3 \n"
+ "lea (%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ intptr_t row1 = static_cast<intptr_t>(src_stride);
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ "lea (%0,%5,1),%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps (%0,%1,1),%%xmm0 \n"
+ "movq (%0,%1,2),%%xmm1 \n"
+ "movhps (%0,%4,1),%%xmm1 \n"
+ "lea (%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps (%5,%1,1),%%xmm2 \n"
+ "movq (%5,%1,2),%%xmm3 \n"
+ "movhps (%5,%4,1),%%xmm3 \n"
+ "lea (%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_ptr), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#ifndef SSE2_DISABLED
+// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
+#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x80,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "shufps $0xff,%%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // SSE2_DISABLED
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+#define HAS_SCALEARGBFILTERROWS_SSSE3
+void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ "4: \n"
+ ".p2align 4 \n"
+ "shufps $0xff,%%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+#endif // defined(__x86_64__) || defined(__i386__)
+
+static void ScaleARGBRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[2];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+ src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+ dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+ src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+ dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+ dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+ src_ptr += 8;
+ dst_ptr += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+ src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+ dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+ src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+ dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+ dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+ src_ptr += src_stepx * 4;
+ dst_ptr += 4;
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+
+#define BLENDER1(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+
+#define BLENDERC(a, b, f, s) static_cast<uint32>( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
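+
+// For one channel, BLENDER1(a, b, f) is a 16.16 fixed-point lerp,
+// a + ((f * (b - a)) >> 16); e.g. BLENDER1(100, 200, 0x8000) = 150.
+// BLENDERC applies it to one byte lane and BLENDER combines all four lanes of
+// a packed 32-bit ARGB pixel.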
+
+static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+static const int kMaxInputWidth = 2560;
+
+// C version 2x2 -> 2x1
+void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ assert(dst_width > 0);
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ uint8* end = dst_ptr + (dst_width << 2);
+ do {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+ dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+ dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+ dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+ dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+ dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+ src_ptr += 8;
+ src_ptr1 += 8;
+ dst_ptr += 8;
+ } while (dst_ptr < end);
+ // Duplicate the last pixel (4 bytes) for filtering.
+ dst_ptr[0] = dst_ptr[-4];
+ dst_ptr[1] = dst_ptr[-3];
+ dst_ptr[2] = dst_ptr[-2];
+ dst_ptr[3] = dst_ptr[-1];
+}
+
+/**
+ * Scale ARGB, 1/2
+ *
+ * This is an optimized version for scaling down an ARGB image to 1/2 of
+ * its original size.
+ *
+ */
+static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
+ ScaleARGBRowDown2_SSE2;
+ }
+#endif
+
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += (src_stride << 1);
+ dst_ptr += dst_stride;
+ }
+}
+
+/**
+ * Scale ARGB down by an even factor
+ *
+ * This is an optimized version for scaling down an ARGB image by an even
+ * factor (e.g. 1/4, 1/6, 1/8) of its original size.
+ *
+ */
+static void ScaleARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+ int src_step = src_width / dst_width;
+ // Adjust to point to center of box.
+ int row_step = src_height / dst_height;
+ int row_stride = row_step * src_stride;
+ src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
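+  // For example, 640x480 -> 160x120 gives src_step = row_step = 4, so this
+  // advances one row and one pixel and each output samples (or, with
+  // filtering, box-averages the 2x2 at) the center of its 4x4 source block.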
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+/**
+ * Scale ARGB to/from any dimensions, with bilinear
+ * interpolation.
+ */
+
+static void ScaleARGBBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(src_width <= kMaxInputWidth);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
+ void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) =
+ ScaleARGBFilterRows_C;
+#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
+ }
+#endif
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ for (int j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_ptr + yi * src_stride;
+ ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
+ ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > maxy) {
+ y = maxy;
+ }
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+// Code is adapted from libyuv bilinear YUV scaling, but with bilinear
+// interpolation disabled, and ARGB pixels instead of YUV.
+static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+/**
+ * Scale ARGB to/from any dimensions, without interpolation.
+ * Fixed-point math is used for performance: the upper 16 bits
+ * of x and dx are the integer part of the source position and
+ * the lower 16 bits are the fixed-point fraction.
+ */
+
+static void ScaleARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ for (int i = 0; i < dst_height; ++i) {
+ ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
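+// Worked example of the 16.16 stepping above: mapping 640 source columns to
+// 480 destination columns gives dx = (640 << 16) / 480 = 87381 (~1.333), and
+// since dx >= 65536 the start is x = (dx >> 1) - 32768 = 10922 (~0.167),
+// aligning sample centers instead of sampling each span's left edge.
+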
+/**
+ * Scale ARGB to/from any dimensions.
+ */
+static void ScaleARGBAnySize(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering || (src_width > kMaxInputWidth)) {
+ ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+// Scale an ARGB image.
+//
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+
+static void ScaleARGB(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+#ifdef CPU_X86
+ // environment variable overrides for testing.
+ char *filter_override = getenv("LIBYUV_FILTER");
+ if (filter_override) {
+ filtering = (FilterMode)atoi(filter_override); // NOLINT
+ }
+#endif
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // Optimized 1/2.
+ ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ int scale_down_x = src_width / dst_width;
+ int scale_down_y = src_height / dst_height;
+ if (dst_width * scale_down_x == src_width &&
+ dst_height * scale_down_y == src_height) {
+ if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
+ // Optimized even scale down, i.e. 4x, 6x, 8x, 10x.
+ ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if ((scale_down_x & 1) && (scale_down_y & 1)) {
+ filtering = kFilterNone;
+ }
+ }
+ // Arbitrary scale up and/or down.
+ ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+}
+
+// ScaleARGB an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (!src_argb || src_width <= 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src_argb = src_argb + (src_height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ filtering);
+ return 0;
+}
+
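+// A minimal usage sketch of ARGBScale, assuming caller-owned buffers with
+// stride == width * 4; sizes and filter choice are illustrative:
+//   uint8 src_argb[640 * 360 * 4];
+//   uint8 dst_argb[320 * 180 * 4];
+//   ARGBScale(src_argb, 640 * 4, 640, 360,
+//             dst_argb, 320 * 4, 320, 180,
+//             kFilterBilinear);  // a 1/2 scale takes the ScaleARGBDown2 path
+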
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
new file mode 100644
index 00000000..a1946f05
--- /dev/null
+++ b/files/source/scale_neon.cc
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+/**
+ * NEON downscalers with interpolation.
+ *
+ * Provided by Fritz Koenig
+ *
+ */
+
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.u8 {q0,q1}, [%0]! \n"
+ "vst1.u8 {q0}, [%1]! \n" // store even pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.u8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld2.u8 {d0, d1}, [%0]! \n"
+ "vtrn.u8 d1, d0 \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vst1.u32 {d0[1]}, [%1]! \n"
+ "subs %2, #4 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "add r4, %0, %3 \n"
+ "add r5, r4, %3 \n"
+ "add %3, r5, %3 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.u8 {q1}, [r4]! \n"
+ "vld1.u8 {q2}, [r5]! \n"
+ "vld1.u8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.u32 {d0[0]}, [%1]! \n"
+ "subs %2, #4 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stride) // %3
+ : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+
+// Downscale from 4 to 3 pixels. Uses the NEON multilane read/write to
+// deinterleave every 4th pixel into each of four registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
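+
+// The vld4/vst3 pair above keeps bytes 0, 1 and 3 of every 4 source bytes.
+// A scalar sketch of the same mapping (hypothetical reference helper,
+// assuming dst_width is a multiple of 3):
+//   static void ScaleRowDown34_Ref(const uint8* src, uint8* dst, int dst_width) {
+//     for (int x = 0; x < dst_width; x += 3, src += 4) {
+//       dst[x + 0] = src[0];
+//       dst[x + 1] = src[1];
+//       dst[x + 2] = src[3];
+//     }
+//   }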
+
+void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
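ScaleRowDown34_0_Int_NEON above blends the two rows as (3 * row0 + row1) >> 2 with rounding, then maps every 4 blended pixels to 3 using the a0/a1/a2 weights in the comments. A scalar sketch of the same filter (names and plain-C rounding are illustrative):

static void ScaleRowDown34_0_Sketch(const unsigned char* src, int src_stride,
                                    unsigned char* dst, int dst_width) {
  const unsigned char* s1 = src + src_stride;
  for (int i = 0; i < dst_width / 3; ++i) {
    unsigned char p[4];  // vertically blended group of 4 source pixels
    for (int x = 0; x < 4; ++x)
      p[x] = (unsigned char)((3 * src[4 * i + x] + s1[4 * i + x] + 2) >> 2);
    dst[3 * i + 0] = (unsigned char)((3 * p[0] + p[1] + 2) >> 2);
    dst[3 * i + 1] = (unsigned char)((p[1] + p[2] + 1) >> 1);
    dst[3 * i + 2] = (unsigned char)((p[2] + 3 * p[3] + 2) >> 2);
  }
}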
+void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+const uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+const uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+const vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+const vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.u8 {d4}, [%1]! \n"
+ "vst1.u32 {d5[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
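kShuf38 above point-samples 8 source pixels down to 3 (indices 0, 3 and 6 of each group of 8), which is the 32 -> 12 reduction. A scalar sketch of the table's effect (illustrative only):

static void ScaleRowDown38_Sketch(const unsigned char* src,
                                  unsigned char* dst, int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    dst[3 * i + 0] = src[8 * i + 0];
    dst[3 * i + 1] = src[8 * i + 3];
    dst[3 * i + 2] = src[8 * i + 6];
  }
}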
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "vld1.u8 {q15}, [%6] \n"
+ "add r4, %0, %3, lsl #1 \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2), // %5
+ "r"(&kMult38_Div9) // %6
+ : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q13", "q14", "q15", "memory", "cc"
+ );
+}
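The divides by 6 and 9 in the kernels above are done with vqrdmulh because NEON has no integer divide: vqrdmulh.s16 returns roughly (2 * a * b + 0x8000) >> 16, so multiplying a sum by 65536 / 12 (or 65536 / 18) approximates sum / 6 (or sum / 9). A scalar sketch of the idea (illustrative; the instruction's exact saturation/rounding is glossed over):

static inline short DivBySixApprox(short sum) {
  const int kMul = 65536 / 12;  // 5461, same value as kMult38_Div6
  return (short)((2 * sum * kMul + 0x8000) >> 16);  // roughly sum / 6
}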
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 2f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 3f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "vld1.u8 {q1}, [%2]! \n"
+ "subs %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 4f \n"
+
+ "2: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "subs %3, #16 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 2b \n"
+ "b 4f \n"
+
+ "3: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "vld1.u8 {q1}, [%2]! \n"
+ "subs %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 3b \n"
+ "4: \n"
+ "vst1.u8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+
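ScaleFilterRows_NEON above is a vertical linear blend with two fast paths: source_y_fraction 0 copies row 0 and 128 averages the rows; otherwise each output is a rounded 8.8 fixed-point mix. Scalar sketch (illustrative, ignoring the trailing duplicate-pixel store):

// dst[i] = (row0[i] * (256 - f) + row1[i] * f + 128) >> 8
static void FilterRows_Sketch(unsigned char* dst, const unsigned char* row0,
                              const unsigned char* row1, int width, int f) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (unsigned char)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
  }
}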
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 8b8ee622..616affd1 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -9,13 +9,14 @@
*/
-#include "video_common.h"
-
-#include <sstream>
+#include "libyuv/video_common.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
struct FourCCAliasEntry {
uint32 alias;
@@ -24,7 +25,8 @@ struct FourCCAliasEntry {
static const FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
- {FOURCC_YU12, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2},
{FOURCC_HDYC, FOURCC_UYVY},
@@ -35,6 +37,7 @@ static const FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_BGR3, FOURCC_24BG},
};
+LIBYUV_API
uint32 CanonicalFourCC(uint32 fourcc) {
for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
if (kFourCCAliases[i].alias == fourcc) {
@@ -45,4 +48,8 @@ uint32 CanonicalFourCC(uint32 fourcc) {
return fourcc;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
+
diff --git a/files/source/video_common.h b/files/source/video_common.h
deleted file mode 100644
index 9fe08a03..00000000
--- a/files/source/video_common.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-/*
-* Common definitions for video, including fourcc and VideoFormat
-*/
-
-
-#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_
-#define LIBYUV_SOURCE_VIDEO_COMMON_H_
-
-#include <string>
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-//////////////////////////////////////////////////////////////////////////////
-// Definition of fourcc.
-//////////////////////////////////////////////////////////////////////////////
-// Convert four characters to a fourcc code.
-// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
-// constants are used in a switch.
-#define FOURCC(a, b, c, d) (\
- (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
- (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
-
-// Some good pages discussing FourCC codes:
-// http://developer.apple.com/quicktime/icefloe/dispatch020.html
-// http://www.fourcc.org/yuv.php
-enum FourCC {
- // Canonical fourcc codes used in our code.
- FOURCC_I420 = FOURCC('I', '4', '2', '0'),
- FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
- FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
- FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
- FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
- FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
- FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
- FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
- FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
- FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
- // Next four are Bayer RGB formats. The four characters define the order of
- // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
- FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
- FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
- FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
- FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
- // Aliases for canonical fourcc codes, replaced with their canonical
- // equivalents by CanonicalFourCC().
- FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420
- FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420
- FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2
- FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac
- FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY
- FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY
- FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG
- FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR
- FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW
- FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG
-
- // Match any fourcc.
- FOURCC_ANY = 0xFFFFFFFF,
-};
-
-// Converts fourcc aliases into canonical ones.
-uint32 CanonicalFourCC(uint32 fourcc);
-
-} // namespace libyuv
-
-#endif // LIBYUV_SOURCE_VIDEO_COMMON_H_
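The FOURCC macro packs its four characters little-endian into a uint32, e.g. FOURCC('I', '4', '2', '0') == 0x30323449, and CanonicalFourCC() folds the alias table above into the canonical codes. Illustrative usage, not part of the patch:

uint32 fmt = CanonicalFourCC(FOURCC_IYUV);  // yields FOURCC_I420
uint32 raw = CanonicalFourCC(FOURCC_ARGB);  // no alias entry: returned unchanged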
diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc
new file mode 100644
index 00000000..8a49a612
--- /dev/null
+++ b/files/unit_test/compare_test.cc
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+
+namespace libyuv {
+
+// hash seed of 5381 recommended.
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 hash = seed;
+ if (count > 0) {
+ do {
+ hash = hash * 33 + *src++;
+ } while (--count);
+ }
+ return hash;
+}
+
+TEST_F(libyuvTest, TestDjb2) {
+ const int kMaxTest = 2049;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ for (int i = 0; i < kMaxTest; ++i) {
+ uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ EXPECT_EQ(h1, h2);
+ }
+ // Hash constant generator used for the tables in compare.
+ int h = 1;
+ for (int i = 0; i <= 16 ; ++i) {
+ printf("%08x ", h);
+ h *= 33;
+ }
+ printf("\n");
+
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_C) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ MaskCpuFlags(kCpuInitialized);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ MaskCpuFlags(-1);
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_OPT) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_Unaligned_OPT) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest + 1)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i + 1] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, SumSquareError) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ uint64 err;
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, 0);
+
+ memset(src_a, 1, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, kMaxWidth);
+
+ memset(src_a, 190, kMaxWidth);
+ memset(src_b, 193, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, (kMaxWidth * 3 * 3));
+
+ srandom(time(NULL));
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = (random() & 0xff);
+ src_b[i] = (random() & 0xff);
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ MaskCpuFlags(-1);
+ uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(c_err, opt_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkPsnr_C) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+
+ double c_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ c_time = (get_time() - c_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6);
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, Psnr) {
+ const int kSrcWidth = 1280;
+ const int kSrcHeight = 720;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_16(src_a, kSrcPlaneSize)
+ align_buffer_16(src_b, kSrcPlaneSize)
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, kMaxPsnr);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, 0.0);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 48.0);
+ EXPECT_LT(err, 49.0);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i)
+ src_a[i] = i;
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 4.0);
+ EXPECT_LT(err, 5.0);
+
+ srandom(time(NULL));
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ double c_err, opt_err;
+
+ c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
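The 48-49 dB bracket above follows directly from the PSNR definition: with every pixel off by exactly 1, MSE = 1 and PSNR = 10 * log10(255^2 / 1), about 48.13 dB (identical frames are reported as kMaxPsnr instead, since the true value would be infinite). A quick check of that constant (illustrative):

#include <math.h>
static double PsnrForUnitError() {
  return 10.0 * log10(255.0 * 255.0 / 1.0);  // about 48.13 dB
}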
+
+TEST_F(libyuvTest, BenchmarkSsim_C) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+
+ double c_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ c_time = (get_time() - c_time) / benchmark_iterations_;
+ printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6);
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkSsim_OPT) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, Ssim) {
+ const int kSrcWidth = 1280;
+ const int kSrcHeight = 720;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_16(src_a, kSrcPlaneSize)
+ align_buffer_16(src_b, kSrcPlaneSize)
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, 1.0);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_LT(err, 0.0001);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 0.8);
+ EXPECT_LT(err, 0.9);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i)
+ src_a[i] = i;
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 0.008);
+ EXPECT_LT(err, 0.009);
+
+ srandom(time(NULL));
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ double c_err, opt_err;
+
+ c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
new file mode 100644
index 00000000..52810e80
--- /dev/null
+++ b/files/unit_test/cpu_test.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(libyuvTest, TestCpuHas) {
+ int cpu_flags = TestCpuFlag(~kCpuInitialized);
+ printf("Cpu Flags %x\n", cpu_flags);
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM %x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON %x\n", has_neon);
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ printf("Has X86 %x\n", has_x86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ printf("Has SSE2 %x\n", has_sse2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ printf("Has SSSE3 %x\n", has_ssse3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ printf("Has SSE4.1 %x\n", has_sse41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ printf("Has SSE4.2 %x\n", has_sse42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ printf("Has AVX %x\n", has_avx);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ printf("Has AVX2 %x\n", has_avx2);
+}
+
+#if defined(__i386__) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_X64)
+TEST_F(libyuvTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
+ CpuId(cpu_info, 0);
+ cpu_info[0] = cpu_info[1]; // Reorder to EBX, EDX, ECX so the 12-byte
+ cpu_info[1] = cpu_info[3]; // vendor string reads contiguously.
+ cpu_info[3] = 0; // NUL terminate for printf/strlen.
+ printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
+ cpu_info[0], cpu_info[1], cpu_info[2]);
+ EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
+ CpuId(cpu_info, 1);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+ model, model);
+ }
+}
+#endif
+
+TEST_F(libyuvTest, TestLinuxNeon) {
+ int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
+ if (testdata) {
+ EXPECT_EQ(kCpuInitialized,
+ ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ((kCpuInitialized | kCpuHasNEON),
+ ArmCpuCaps("unit_test/testdata/tegra3.txt"));
+ } else {
+ printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
+#endif
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc
new file mode 100644
index 00000000..e9053a35
--- /dev/null
+++ b/files/unit_test/planar_test.cc
@@ -0,0 +1,1005 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
+namespace libyuv {
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \
+ align_buffer_16(src_y, kWidth * kHeight); \
+ align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_argb_c, kStride * kHeight); \
+ align_buffer_16(dst_argb_opt, kStride * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j] = (random() & 0xff); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ src_u[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \
+ src_v[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_u, kWidth / SUBSAMP_X, \
+ src_v, kWidth / SUBSAMP_X, \
+ dst_argb_c, kStride, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_u, kWidth / SUBSAMP_X, \
+ src_v, kWidth / SUBSAMP_X, \
+ dst_argb_opt, kStride, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * BPP_B; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
+ static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(src_y) \
+ free_aligned_buffer_16(src_u) \
+ free_aligned_buffer_16(src_v) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2)
+// TODO(fbarchard): Re-enable test and fix valgrind.
+// TESTPLANARTOB(I420, 2, 2, V210, 16 / 6)
+TESTPLANARTOB(I420, 2, 2, I400, 1)
+TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1)
+TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1)
+TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1)
+TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ N, NEG) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ align_buffer_16(src_y, kWidth * kHeight); \
+ align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \
+ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j] = (random() & 0xff); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \
+ for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) { \
+ src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_uv, kWidth / SUBSAMP_X * 2, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_uv, kWidth / SUBSAMP_X * 2, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * BPP_B; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
+ static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_16(src_y) \
+ free_aligned_buffer_16(src_uv) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
+
+#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+ align_buffer_16(src_argb, kStride * kHeight); \
+ align_buffer_16(dst_y_c, kWidth * kHeight); \
+ align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_y_opt, kWidth * kHeight); \
+ align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j] = (random() & 0xff); \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_PLANAR(src_argb, kStride, \
+ dst_y_c, kWidth, \
+ dst_u_c, kWidth / SUBSAMP_X, \
+ dst_v_c, kWidth / SUBSAMP_X, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb, kStride, \
+ dst_y_opt, kWidth, \
+ dst_u_opt, kWidth / SUBSAMP_X, \
+ dst_v_opt, kWidth / SUBSAMP_X, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) - \
+ static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) - \
+ static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(dst_y_c) \
+ free_aligned_buffer_16(dst_u_c) \
+ free_aligned_buffer_16(dst_v_c) \
+ free_aligned_buffer_16(dst_y_opt) \
+ free_aligned_buffer_16(dst_u_opt) \
+ free_aligned_buffer_16(dst_v_opt) \
+ free_aligned_buffer_16(src_argb) \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -)
+
+TESTATOPLANAR(ARGB, 4, I420, 2, 2)
+TESTATOPLANAR(BGRA, 4, I420, 2, 2)
+TESTATOPLANAR(ABGR, 4, I420, 2, 2)
+TESTATOPLANAR(RGBA, 4, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, I420, 2, 2)
+TESTATOPLANAR(ARGB4444, 2, I420, 2, 2)
+// TESTATOPLANAR(ARGB, 4, I411, 4, 1)
+TESTATOPLANAR(ARGB, 4, I422, 2, 1)
+// TESTATOPLANAR(ARGB, 4, I444, 1, 1)
+// TODO(fbarchard): Implement and test 411 and 444
+TESTATOPLANAR(YUY2, 2, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, I422, 2, 1)
+TESTATOPLANAR(UYVY, 2, I422, 2, 1)
+TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
+TESTATOPLANAR(I400, 1, I420, 2, 2)
+TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
+TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
+TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2)
+TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, N, NEG) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
+ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(src_argb) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
+
+TESTATOB(I400, 1, 1, I400, 1)
+TESTATOB(ARGB, 4, 4, ARGB, 4)
+TESTATOB(ARGB, 4, 4, BGRA, 4)
+TESTATOB(ARGB, 4, 4, ABGR, 4)
+TESTATOB(ARGB, 4, 4, RGBA, 4)
+TESTATOB(ARGB, 4, 4, RAW, 3)
+TESTATOB(ARGB, 4, 4, RGB24, 3)
+TESTATOB(ARGB, 4, 4, RGB565, 2)
+TESTATOB(ARGB, 4, 4, ARGB1555, 2)
+TESTATOB(ARGB, 4, 4, ARGB4444, 2)
+TESTATOB(BGRA, 4, 4, ARGB, 4)
+TESTATOB(ABGR, 4, 4, ARGB, 4)
+TESTATOB(RGBA, 4, 4, ARGB, 4)
+TESTATOB(RAW, 3, 3, ARGB, 4)
+TESTATOB(RGB24, 3, 3, ARGB, 4)
+TESTATOB(RGB565, 2, 2, ARGB, 4)
+TESTATOB(ARGB1555, 2, 2, ARGB, 4)
+TESTATOB(ARGB4444, 2, 2, ARGB, 4)
+TESTATOB(YUY2, 2, 2, ARGB, 4)
+TESTATOB(UYVY, 2, 2, ARGB, 4)
+TESTATOB(M420, 3 / 2, 1, ARGB, 4)
+
+static const int kReadPad = 16; // Allow overread of 16 bytes.
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
+ srandom(time(NULL)); \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (random() & 63) + 1; \
+ const int kHeight = (random() & 31) + 1; \
+ align_buffer_page_end(src_argb, (kWidth * BPP_A) * kHeight + kReadPad); \
+ align_buffer_page_end(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_page_end(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, kHeight); \
+ MaskCpuFlags(-1); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_page_end(src_argb) \
+ free_aligned_buffer_page_end(dst_argb_c) \
+ free_aligned_buffer_page_end(dst_argb_opt) \
+ } \
+}
+
+TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4)
+TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4)
+TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4)
+TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4)
+TESTATOBRANDOM(ARGB, 4, 4, RAW, 3)
+TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3)
+TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2)
+TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2)
+TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2)
+
+TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4)
+TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4)
+TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4)
+TESTATOBRANDOM(RAW, 3, 3, ARGB, 4)
+TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4)
+TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4)
+TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4)
+TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4)
+
+TEST_F(libyuvTest, TestAttenuate) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
+
+ // Test unattenuation clamps
+ orig_pixels[0][0] = 200u;
+ orig_pixels[0][1] = 129u;
+ orig_pixels[0][2] = 127u;
+ orig_pixels[0][3] = 128u;
+ // Test unattenuation transparent and opaque are unaffected
+ orig_pixels[1][0] = 16u;
+ orig_pixels[1][1] = 64u;
+ orig_pixels[1][2] = 192u;
+ orig_pixels[1][3] = 0u;
+ orig_pixels[2][0] = 16u;
+ orig_pixels[2][1] = 64u;
+ orig_pixels[2][2] = 192u;
+ orig_pixels[2][3] = 255u;
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 128u;
+ ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
+ EXPECT_EQ(255u, unatten_pixels[0][0]);
+ EXPECT_EQ(255u, unatten_pixels[0][1]);
+ EXPECT_EQ(254u, unatten_pixels[0][2]);
+ EXPECT_EQ(128u, unatten_pixels[0][3]);
+ EXPECT_EQ(16u, unatten_pixels[1][0]);
+ EXPECT_EQ(64u, unatten_pixels[1][1]);
+ EXPECT_EQ(192u, unatten_pixels[1][2]);
+ EXPECT_EQ(0u, unatten_pixels[1][3]);
+ EXPECT_EQ(16u, unatten_pixels[2][0]);
+ EXPECT_EQ(64u, unatten_pixels[2][1]);
+ EXPECT_EQ(192u, unatten_pixels[2][2]);
+ EXPECT_EQ(255u, unatten_pixels[2][3]);
+ EXPECT_EQ(32u, unatten_pixels[3][0]);
+ EXPECT_EQ(128u, unatten_pixels[3][1]);
+ EXPECT_EQ(255u, unatten_pixels[3][2]);
+ EXPECT_EQ(128u, unatten_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
+ ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
+ }
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
+ EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
+ EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
+ EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
+ }
+ // Make sure transparent, 50% and opaque are fully accurate.
+ EXPECT_EQ(0, atten_pixels[0][0]);
+ EXPECT_EQ(0, atten_pixels[0][1]);
+ EXPECT_EQ(0, atten_pixels[0][2]);
+ EXPECT_EQ(0, atten_pixels[0][3]);
+ EXPECT_EQ(64, atten_pixels[128][0]);
+ EXPECT_EQ(32, atten_pixels[128][1]);
+ EXPECT_EQ(21, atten_pixels[128][2]);
+ EXPECT_EQ(128, atten_pixels[128][3]);
+ EXPECT_EQ(255, atten_pixels[255][0]);
+ EXPECT_EQ(127, atten_pixels[255][1]);
+ EXPECT_EQ(85, atten_pixels[255][2]);
+ EXPECT_EQ(255, atten_pixels[255][3]);
+}
+
+TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
+ SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+ SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ orig_pixels[y][x][0] = 1u;
+ orig_pixels[y][x][1] = 2u;
+ orig_pixels[y][x][2] = 3u;
+ orig_pixels[y][x][3] = 255u;
+ }
+ }
+
+ ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+ &added_pixels[0][0][0], 16 * 4,
+ 16, 16);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+ EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+ EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+ EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+ }
+ }
+}
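ARGBComputeCumulativeSum builds a per-channel integral image, sum[y][x] = v[y][x] + sum[y-1][x] + sum[y][x-1] - sum[y-1][x-1], so a constant input of 1 yields (x + 1) * (y + 1), exactly what the loop above checks. Minimal single-channel sketch (illustrative only):

static void CumulativeSum_Sketch(const unsigned char* src, int* dst,
                                 int width, int height) {
  for (int y = 0; y < height; ++y) {
    int row = 0;  // running sum along the current row
    for (int x = 0; x < width; ++x) {
      row += src[y * width + x];
      dst[y * width + x] = row + (y ? dst[(y - 1) * width + x] : 0);
    }
  }
}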
+
+TEST_F(libyuvTest, TestARGBGray) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(27u, orig_pixels[0][0]);
+ EXPECT_EQ(27u, orig_pixels[0][1]);
+ EXPECT_EQ(27u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(151u, orig_pixels[1][0]);
+ EXPECT_EQ(151u, orig_pixels[1][1]);
+ EXPECT_EQ(151u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(75u, orig_pixels[2][0]);
+ EXPECT_EQ(75u, orig_pixels[2][1]);
+ EXPECT_EQ(75u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(96u, orig_pixels[3][0]);
+ EXPECT_EQ(96u, orig_pixels[3][1]);
+ EXPECT_EQ(96u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBGrayTo) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 gray_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
+ EXPECT_EQ(27u, gray_pixels[0][0]);
+ EXPECT_EQ(27u, gray_pixels[0][1]);
+ EXPECT_EQ(27u, gray_pixels[0][2]);
+ EXPECT_EQ(128u, gray_pixels[0][3]);
+ EXPECT_EQ(151u, gray_pixels[1][0]);
+ EXPECT_EQ(151u, gray_pixels[1][1]);
+ EXPECT_EQ(151u, gray_pixels[1][2]);
+ EXPECT_EQ(0u, gray_pixels[1][3]);
+ EXPECT_EQ(75u, gray_pixels[2][0]);
+ EXPECT_EQ(75u, gray_pixels[2][1]);
+ EXPECT_EQ(75u, gray_pixels[2][2]);
+ EXPECT_EQ(255u, gray_pixels[2][3]);
+ EXPECT_EQ(96u, gray_pixels[3][0]);
+ EXPECT_EQ(96u, gray_pixels[3][1]);
+ EXPECT_EQ(96u, gray_pixels[3][2]);
+ EXPECT_EQ(224u, gray_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBSepia) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(88u, orig_pixels[3][0]);
+ EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Matrix for Sepia.
+ static const int8 kARGBToSepia[] = {
+ 17, 68, 35, 0,
+ 22, 88, 45, 0,
+ 24, 98, 50, 0,
+ };
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(88u, orig_pixels[3][0]);
+ EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Color table; only the first 16 entries are defined, the rest are zero.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u,
+ 5u, 6u, 7u, 8u,
+ 9u, 10u, 11u, 12u,
+ 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(4u, orig_pixels[0][3]);
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(8u, orig_pixels[1][3]);
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(12u, orig_pixels[2][3]);
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(16u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBQuantize) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]);
+ EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]);
+ EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBMirror) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 dst_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i / 4;
+ }
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, dst_pixels[255 - i][0]);
+ EXPECT_EQ(i / 2, dst_pixels[255 - i][1]);
+ EXPECT_EQ(i / 3, dst_pixels[255 - i][2]);
+ EXPECT_EQ(i / 4, dst_pixels[255 - i][3]);
+ }
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestShade) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 shade_pixels[256][4]);
+
+ orig_pixels[0][0] = 10u;
+ orig_pixels[0][1] = 20u;
+ orig_pixels[0][2] = 40u;
+ orig_pixels[0][3] = 80u;
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 0u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 255u;
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 0u;
+ orig_pixels[2][3] = 0u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 0u;
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
+ EXPECT_EQ(10u, shade_pixels[0][0]);
+ EXPECT_EQ(20u, shade_pixels[0][1]);
+ EXPECT_EQ(40u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+ EXPECT_EQ(0u, shade_pixels[1][0]);
+ EXPECT_EQ(0u, shade_pixels[1][1]);
+ EXPECT_EQ(0u, shade_pixels[1][2]);
+ EXPECT_EQ(128u, shade_pixels[1][3]);
+ EXPECT_EQ(0u, shade_pixels[2][0]);
+ EXPECT_EQ(0u, shade_pixels[2][1]);
+ EXPECT_EQ(0u, shade_pixels[2][2]);
+ EXPECT_EQ(0u, shade_pixels[2][3]);
+ EXPECT_EQ(0u, shade_pixels[3][0]);
+ EXPECT_EQ(0u, shade_pixels[3][1]);
+ EXPECT_EQ(0u, shade_pixels[3][2]);
+ EXPECT_EQ(0u, shade_pixels[3][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(10u, shade_pixels[0][1]);
+ EXPECT_EQ(20u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
+ 0x80808080);
+ }
+}
+
+TEST_F(libyuvTest, TestInterpolate) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
+
+ orig_pixels_0[0][0] = 16u;
+ orig_pixels_0[0][1] = 32u;
+ orig_pixels_0[0][2] = 64u;
+ orig_pixels_0[0][3] = 128u;
+ orig_pixels_0[1][0] = 0u;
+ orig_pixels_0[1][1] = 0u;
+ orig_pixels_0[1][2] = 0u;
+ orig_pixels_0[1][3] = 255u;
+ orig_pixels_0[2][0] = 0u;
+ orig_pixels_0[2][1] = 0u;
+ orig_pixels_0[2][2] = 0u;
+ orig_pixels_0[2][3] = 0u;
+ orig_pixels_0[3][0] = 0u;
+ orig_pixels_0[3][1] = 0u;
+ orig_pixels_0[3][2] = 0u;
+ orig_pixels_0[3][3] = 0u;
+
+ orig_pixels_1[0][0] = 0u;
+ orig_pixels_1[0][1] = 0u;
+ orig_pixels_1[0][2] = 0u;
+ orig_pixels_1[0][3] = 0u;
+ orig_pixels_1[1][0] = 0u;
+ orig_pixels_1[1][1] = 0u;
+ orig_pixels_1[1][2] = 0u;
+ orig_pixels_1[1][3] = 0u;
+ orig_pixels_1[2][0] = 0u;
+ orig_pixels_1[2][1] = 0u;
+ orig_pixels_1[2][2] = 0u;
+ orig_pixels_1[2][3] = 0u;
+ orig_pixels_1[3][0] = 255u;
+ orig_pixels_1[3][1] = 255u;
+ orig_pixels_1[3][2] = 255u;
+ orig_pixels_1[3][3] = 255u;
+
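+ // The last argument is the interpolation fraction in 1/256 units: 0 returns
+ // source 0 unchanged, 128 is an even 50/50 blend, and 192 weights source 1
+ // at three quarters, as the expectations below confirm.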
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0][0]);
+ EXPECT_EQ(16u, interpolate_pixels[0][1]);
+ EXPECT_EQ(32u, interpolate_pixels[0][2]);
+ EXPECT_EQ(64u, interpolate_pixels[0][3]);
+ EXPECT_EQ(0u, interpolate_pixels[1][0]);
+ EXPECT_EQ(0u, interpolate_pixels[1][1]);
+ EXPECT_EQ(0u, interpolate_pixels[1][2]);
+ EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128.
+ EXPECT_EQ(0u, interpolate_pixels[2][0]);
+ EXPECT_EQ(0u, interpolate_pixels[2][1]);
+ EXPECT_EQ(0u, interpolate_pixels[2][2]);
+ EXPECT_EQ(0u, interpolate_pixels[2][3]);
+ EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0][0]);
+ EXPECT_EQ(32u, interpolate_pixels[0][1]);
+ EXPECT_EQ(64u, interpolate_pixels[0][2]);
+ EXPECT_EQ(128u, interpolate_pixels[0][3]);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0][0]);
+ EXPECT_EQ(8u, interpolate_pixels[0][1]);
+ EXPECT_EQ(16u, interpolate_pixels[0][2]);
+ EXPECT_EQ(32u, interpolate_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) {
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 256, 1, 128);
+ }
+}
+
+TEST_F(libyuvTest, TestAffine) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]);
+#endif
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels_0[i][j] = i;
+ }
+ }
+
+ float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
+
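+ // uv_step packs the starting (u, v) source coordinate and the per-pixel
+ // (du, dv) step; with du = 0.75 destination pixel x samples source pixel
+ // 0.75 * x, which the spot checks at x = 128 and x = 255 below rely on.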
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
+ uv_step, 256);
+ EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
+ EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
+ EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4));
+#endif
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ if (has_sse2) {
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ }
+ } else {
+#endif
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
+ uv_step, 256);
+ }
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ }
+#endif
+}
+
+TEST_F(libyuvTest, Test565) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
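+ // Convert the gradient to RGB565 and compare a HashDjb2 checksum of the
+ // packed output against a known-good value instead of checking each pixel.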
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc
new file mode 100644
index 00000000..fe8435e1
--- /dev/null
+++ b/files/unit_test/rotate_argb_test.cc
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static int ARGBTestRotate(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode, int runs) {
+ const int b = 128;
+ int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
+ int src_stride_argb = (b * 2 + src_width) * 4;
+
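+ // Each plane is allocated with a border of b pixels on every side and the
+ // rotate operates on the interior region, so any stray reads or writes land
+ // in the border rather than outside the allocation.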
+ align_buffer_16(src_argb, src_argb_plane_size)
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ srandom(time(NULL));
+
+ int i, j;
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
+ }
+ }
+
+ align_buffer_16(dst_argb_c, dst_argb_plane_size)
+ align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < runs; ++i) {
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ c_time = (get_time() - c_time) / runs;
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < runs; ++i) {
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ opt_time = (get_time() - opt_time) / runs;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+
+ // The C version may differ slightly from the optimized version because the
+ // order of operations can introduce rounding. Diff the two buffers and
+ // verify that the maximum per-byte difference stays within a small tolerance.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff)
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_16(dst_argb_c)
+ free_aligned_buffer_16(dst_argb_opt)
+ free_aligned_buffer_16(src_argb)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBRotate0) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 1280;
+ const int dst_height = 720;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate0,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate90) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 720;
+ const int dst_height = 1280;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate90,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate180) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 1280;
+ const int dst_height = 720;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate180,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate270) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 720;
+ const int dst_height = 1280;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate270,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate0_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 1277;
+ const int dst_height = 719;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate0,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate90_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 719;
+ const int dst_height = 1277;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate90,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate180_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 1277;
+ const int dst_height = 719;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate180,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate270_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 719;
+ const int dst_height = 1277;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate270,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
index 1c295b08..788e511e 100644
--- a/files/unit_test/rotate_test.cc
+++ b/files/unit_test/rotate_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,21 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/rotate.h"
-#include "../source/rotate_priv.h"
-#include "unit_test.h"
#include <stdlib.h>
#include <time.h>
-using namespace libyuv;
-
-void print_array(uint8 *array, int w, int h) {
- int i, j;
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j)
- printf("%4d", (signed char)array[(i * w) + j]);
+namespace libyuv {
+void PrintArray(uint8 *array, int w, int h) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ printf("%4d", (signed char)array[i * w + j]);
+ }
printf("\n");
}
}
@@ -31,46 +29,45 @@ TEST_F(libyuvTest, Transpose) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_1;
- uint8 *output_2;
-
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_1, ow * oh)
+ align_buffer_16(output_2, iw * ih)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
TransposePlane(input, iw, output_1, ow, iw, ih);
TransposePlane(output_1, ow, output_2, oh, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_2[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_2[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("transpose 1\n");
- print_array(output_1, ow, oh);
+ PrintArray(output_1, ow, oh);
printf("transpose 2\n");
- print_array(output_2, iw, ih);
+ PrintArray(output_2, iw, ih);
}
- free(input);
- free(output_1);
- free(output_2);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_1)
+ free_aligned_buffer_16(output_2)
}
+ }
EXPECT_EQ(0, err);
}
@@ -79,23 +76,20 @@ TEST_F(libyuvTest, TransposeUV) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_a1, *output_b1;
- uint8 *output_a2, *output_b2;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_a1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_b1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_a2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_b2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_a1, ow * oh)
+ align_buffer_16(output_b1, ow * oh)
+ align_buffer_16(output_a2, iw * ih)
+ align_buffer_16(output_b2, iw * ih)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -105,32 +99,35 @@ TEST_F(libyuvTest, TransposeUV) {
TransposePlane(output_a1, ow, output_a2, oh, ow, oh);
TransposePlane(output_b1, ow, output_b2, oh, ow, oh);
- for (i = 0; i < (iw * ih); i += 2) {
- if (input[i] != output_a2[i >> 1])
+ for (i = 0; i < iw * ih; i += 2) {
+ if (input[i] != output_a2[i >> 1]) {
err++;
- if (input[i + 1] != output_b2[i >> 1])
+ }
+ if (input[i + 1] != output_b2[i >> 1]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("transpose 1\n");
- print_array(output_a1, ow, oh);
- print_array(output_b1, ow, oh);
+ PrintArray(output_a1, ow, oh);
+ PrintArray(output_b1, ow, oh);
printf("transpose 2\n");
- print_array(output_a2, oh, ow);
- print_array(output_b2, oh, ow);
+ PrintArray(output_a2, oh, ow);
+ PrintArray(output_b2, oh, ow);
}
- free(input);
- free(output_a1);
- free(output_b1);
- free(output_a2);
- free(output_b2);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_a1)
+ free_aligned_buffer_16(output_b1)
+ free_aligned_buffer_16(output_a2)
+ free_aligned_buffer_16(output_b2)
}
+ }
EXPECT_EQ(0, err);
}
@@ -139,60 +136,58 @@ TEST_F(libyuvTest, RotatePlane90) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
- uint8 *output_180;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
+ align_buffer_16(output_180, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw, output_90, ow, iw, ih);
RotatePlane90(output_90, ow, output_180, oh, ow, oh);
RotatePlane90(output_180, oh, output_270, ow, oh, ow);
RotatePlane90(output_270, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 90\n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 270\n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
- free(output_180);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
+ free_aligned_buffer_16(output_180)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
@@ -201,29 +196,22 @@ TEST_F(libyuvTest, RotateUV90) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_90_u;
- uint8 *output_90_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_90_u, ow * oh)
+ align_buffer_16(output_90_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -237,43 +225,46 @@ TEST_F(libyuvTest, RotateUV90) {
RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 90_u\n");
- print_array(output_90_u, ow, oh);
+ PrintArray(output_90_u, ow, oh);
printf("output 90_v\n");
- print_array(output_90_v, ow, oh);
+ PrintArray(output_90_v, ow, oh);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, oh, ow);
+ PrintArray(output_0_u, oh, ow);
printf("output 0_v\n");
- print_array(output_0_v, oh, ow);
+ PrintArray(output_0_v, oh, ow);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_90_u);
- free(output_90_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_90_u)
+ free_aligned_buffer_16(output_90_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -282,29 +273,22 @@ TEST_F(libyuvTest, RotateUV180) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_90_u;
- uint8 *output_90_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = iw >> 1;
oh = ih;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_90_u, ow * oh)
+ align_buffer_16(output_90_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -318,43 +302,46 @@ TEST_F(libyuvTest, RotateUV180) {
RotatePlane90(output_90_v, oh, output_0_v, ow, oh, ow);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 90_u\n");
- print_array(output_90_u, oh, ow);
+ PrintArray(output_90_u, oh, ow);
printf("output 90_v\n");
- print_array(output_90_v, oh, ow);
+ PrintArray(output_90_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, ow, oh);
+ PrintArray(output_0_u, ow, oh);
printf("output 0_v\n");
- print_array(output_0_v, ow, oh);
+ PrintArray(output_0_v, ow, oh);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_90_u);
- free(output_90_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_90_u)
+ free_aligned_buffer_16(output_90_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -363,29 +350,22 @@ TEST_F(libyuvTest, RotateUV270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_270_u;
- uint8 *output_270_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_270_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_270_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_270_u, ow * oh)
+ align_buffer_16(output_270_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -400,43 +380,46 @@ TEST_F(libyuvTest, RotateUV270) {
RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 270_u\n");
- print_array(output_270_u, ow, oh);
+ PrintArray(output_270_u, ow, oh);
printf("output 270_v\n");
- print_array(output_270_v, ow, oh);
+ PrintArray(output_270_v, ow, oh);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, oh, ow);
+ PrintArray(output_0_u, oh, ow);
printf("output 0_v\n");
- print_array(output_0_v, oh, ow);
+ PrintArray(output_0_v, oh, ow);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_270_u);
- free(output_270_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_270_u)
+ free_aligned_buffer_16(output_270_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -445,45 +428,44 @@ TEST_F(libyuvTest, RotatePlane180) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_180;
ow = iw;
oh = ih;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_180, iw * ih)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane180(input, iw, output_180, ow, iw, ih);
RotatePlane180(output_180, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_180);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_180)
}
EXPECT_EQ(0, err);
@@ -493,25 +475,20 @@ TEST_F(libyuvTest, RotatePlane270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
- uint8 *output_180;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
+ align_buffer_16(output_180, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i)
input[i] = i;
RotatePlane270(input, iw, output_270, ow, iw, ih);
@@ -519,34 +496,36 @@ TEST_F(libyuvTest, RotatePlane270) {
RotatePlane270(output_180, oh, output_90, ow, oh, ow);
RotatePlane270(output_90, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 270\n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 90\n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
- free(output_180);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
+ free_aligned_buffer_16(output_180)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
@@ -555,44 +534,44 @@ TEST_F(libyuvTest, RotatePlane90and270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
+
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw, output_90, ow, iw, ih);
RotatePlane270(output_90, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
}
EXPECT_EQ(0, err);
@@ -602,21 +581,20 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
int iw, ih;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
+
int ow = ih;
int oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw,
output_90 + (ow >> 1), ow,
@@ -633,25 +611,26 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
RotatePlane270(output_90, ih, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
}
EXPECT_EQ(0, err);
@@ -661,22 +640,20 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) {
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane270(input, iw,
output_270 + ow * (oh >> 1), ow,
@@ -693,36 +670,34 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
RotatePlane90(output_270, ih, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, I420Rotate90) {
int err = 0;
- uint8 *orig_y, *orig_u, *orig_v;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -732,50 +707,59 @@ TEST_F(libyuvTest, I420Rotate90) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_u, uv_plane_size)
+ align_buffer_16(orig_v, uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_u, 0, uv_plane_size);
+ memset(orig_v, 0, uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
for (j = b; j < (uvw + b); ++j) {
- orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff;
- orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff;
+ orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
+ orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
I420Rotate(orig_y+y_off_0, y_st_0,
orig_u+uv_off_0, uv_st_0,
@@ -805,39 +789,38 @@ TEST_F(libyuvTest, I420Rotate90) {
kRotateClockwise);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
for (i = 0; i < uv_plane_size; ++i) {
- if (orig_u[i] != ro0_u[i])
+ if (orig_u[i] != ro0_u[i]) {
++err;
- if (orig_v[i] != ro0_v[i])
+ }
+ if (orig_v[i] != ro0_v[i]) {
++err;
+ }
}
- free(orig_y);
- free(orig_u);
- free(orig_v);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_u)
+ free_aligned_buffer_16(orig_v)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, I420Rotate270) {
int err = 0;
- uint8 *orig_y, *orig_u, *orig_v;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -847,50 +830,59 @@ TEST_F(libyuvTest, I420Rotate270) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_u, uv_plane_size)
+ align_buffer_16(orig_v, uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_u, 0, uv_plane_size);
+ memset(orig_v, 0, uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
for (j = b; j < (uvw + b); ++j) {
- orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff;
- orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff;
+ orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
+ orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
I420Rotate(orig_y+y_off_0, y_st_0,
orig_u+uv_off_0, uv_st_0,
@@ -920,38 +912,38 @@ TEST_F(libyuvTest, I420Rotate270) {
kRotateCounterClockwise);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
for (i = 0; i < uv_plane_size; ++i) {
- if (orig_u[i] != ro0_u[i])
+ if (orig_u[i] != ro0_u[i]) {
++err;
- if (orig_v[i] != ro0_v[i])
+ }
+ if (orig_v[i] != ro0_v[i]) {
++err;
+ }
}
- free(orig_y);
- free(orig_u);
- free(orig_v);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_u)
+ free_aligned_buffer_16(orig_v)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate90) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
int yw = 1024;
int yh = 768;
@@ -960,47 +952,53 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1027,32 +1025,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) {
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate270) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -1062,47 +1060,53 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_270 = b * (yh + (2 * b)) + b;
- int uv_off_270 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_270 = b * (yh + b * 2) + b;
+ int uv_off_270 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_270 = yh + (2 * b);
- int uv_st_270 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_270 = yh + b * 2;
+ int uv_st_270 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1129,32 +1133,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) {
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate180) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro180_y, *ro180_u, *ro180_v;
int yw = 1024;
int yh = 768;
@@ -1164,43 +1168,49 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro180_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro180_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro180_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro180_y, y_plane_size)
+ align_buffer_16(ro180_u, uv_plane_size)
+ align_buffer_16(ro180_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro180_y, 0, y_plane_size);
+ memset(ro180_u, 0, uv_plane_size);
+ memset(ro180_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off = b * (yw + (2 * b)) + b;
- int uv_off = b * (uvw + (2 * b)) + b;
+ int y_off = b * (yw + b * 2) + b;
+ int uv_off = b * (uvw + b * 2) + b;
- int y_st = yw + (2 * b);
- int uv_st = uvw + (2 * b);
+ int y_st = yw + b * 2;
+ int uv_st = uvw + b * 2;
NV12ToI420Rotate(orig_y+y_off, y_st,
orig_uv+y_off, y_st,
@@ -1220,40 +1230,40 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) {
kRotate180);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro180_y);
- free(ro180_u);
- free(ro180_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro180_y)
+ free_aligned_buffer_16(ro180_u)
+ free_aligned_buffer_16(ro180_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
int y_err = 0, uv_err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *roa_y, *roa_u, *roa_v;
- uint8 *rob_y, *rob_u, *rob_v;
- uint8 *roc_y, *roc_u, *roc_v;
int yw = 1024;
int yh = 768;
@@ -1262,51 +1272,59 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- roc_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roc_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roc_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(roa_y, y_plane_size)
+ align_buffer_16(roa_u, uv_plane_size)
+ align_buffer_16(roa_v, uv_plane_size)
+ align_buffer_16(rob_y, y_plane_size)
+ align_buffer_16(rob_u, uv_plane_size)
+ align_buffer_16(rob_v, uv_plane_size)
+ align_buffer_16(roc_y, y_plane_size)
+ align_buffer_16(roc_u, uv_plane_size)
+ align_buffer_16(roc_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(roa_y, 0, y_plane_size);
+ memset(roa_u, 0, uv_plane_size);
+ memset(roa_v, 0, uv_plane_size);
+ memset(rob_y, 0, y_plane_size);
+ memset(rob_u, 0, uv_plane_size);
+ memset(rob_v, 0, uv_plane_size);
+ memset(roc_y, 0, y_plane_size);
+ memset(roc_u, 0, uv_plane_size);
+ memset(roc_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1335,73 +1353,74 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
kRotate180);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != roc_y[i])
+ if (orig_y[i] != roc_y[i]) {
++y_err;
+ }
}
if (y_err) {
printf("input %dx%d \n", yw, yh);
- print_array(orig_y, y_st_0, yh + (2 * b));
+ PrintArray(orig_y, y_st_0, yh + b * 2);
printf("rotate a\n");
- print_array(roa_y, y_st_90, y_st_0);
+ PrintArray(roa_y, y_st_90, y_st_0);
printf("rotate b\n");
- print_array(rob_y, y_st_90, y_st_0);
+ PrintArray(rob_y, y_st_90, y_st_0);
printf("rotate c\n");
- print_array(roc_y, y_st_0, y_st_90);
+ PrintArray(roc_y, y_st_0, y_st_90);
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)roc_u[i] != -(signed char)roc_v[i])
+ if ((signed char)roc_u[i] != -(signed char)roc_v[i]) {
++uv_err;
- if (rob_u[i] != 0)
+ }
+ if (rob_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++uv_err;
+ }
if (uv_err) {
- printf("input %dx%d \n", (2 * uvw), uvh);
- print_array(orig_uv, y_st_0, uvh + (2 * b));
+ printf("input %dx%d \n", uvw * 2, uvh);
+ PrintArray(orig_uv, y_st_0, uvh + b * 2);
printf("rotate a\n");
- print_array(roa_u, uv_st_90, uv_st_0);
- print_array(roa_v, uv_st_90, uv_st_0);
+ PrintArray(roa_u, uv_st_90, uv_st_0);
+ PrintArray(roa_v, uv_st_90, uv_st_0);
printf("rotate b\n");
- print_array(rob_u, uv_st_90, uv_st_0);
- print_array(rob_v, uv_st_90, uv_st_0);
+ PrintArray(rob_u, uv_st_90, uv_st_0);
+ PrintArray(rob_v, uv_st_90, uv_st_0);
printf("rotate c\n");
- print_array(roc_u, uv_st_0, uv_st_90);
- print_array(roc_v, uv_st_0, uv_st_90);
+ PrintArray(roc_u, uv_st_0, uv_st_90);
+ PrintArray(roc_v, uv_st_0, uv_st_90);
}
- free(orig_y);
- free(orig_uv);
- free(roa_y);
- free(roa_u);
- free(roa_v);
- free(rob_y);
- free(rob_u);
- free(rob_v);
- free(roc_y);
- free(roc_u);
- free(roc_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(roa_y)
+ free_aligned_buffer_16(roa_u)
+ free_aligned_buffer_16(roa_v)
+ free_aligned_buffer_16(rob_y)
+ free_aligned_buffer_16(rob_u)
+ free_aligned_buffer_16(rob_v)
+ free_aligned_buffer_16(roc_y)
+ free_aligned_buffer_16(roc_u)
+ free_aligned_buffer_16(roc_v)
EXPECT_EQ(0, y_err + uv_err);
}
TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
int y_err = 0, uv_err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *roa_y, *roa_u, *roa_v;
- uint8 *rob_y, *rob_u, *rob_v;
int yw = 1024;
int yh = 768;
@@ -1410,43 +1429,49 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(roa_y, y_plane_size)
+ align_buffer_16(roa_u, uv_plane_size)
+ align_buffer_16(roa_v, uv_plane_size)
+ align_buffer_16(rob_y, y_plane_size)
+ align_buffer_16(rob_u, uv_plane_size)
+ align_buffer_16(rob_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(roa_y, 0, y_plane_size);
+ memset(roa_u, 0, uv_plane_size);
+ memset(roa_v, 0, uv_plane_size);
+ memset(rob_y, 0, y_plane_size);
+ memset(rob_u, 0, uv_plane_size);
+ memset(rob_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off = b * (yw + (2 * b)) + b;
- int uv_off = b * (uvw + (2 * b)) + b;
+ int y_off = b * (yw + b * 2) + b;
+ int uv_off = b * (uvw + b * 2) + b;
- int y_st = yw + (2 * b);
- int uv_st = uvw + (2 * b);
+ int y_st = yw + b * 2;
+ int uv_st = uvw + b * 2;
NV12ToI420Rotate(orig_y+y_off, y_st,
orig_uv+y_off, y_st,
@@ -1472,48 +1497,53 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
if (y_err) {
printf("input %dx%d \n", yw, yh);
- print_array(orig_y, y_st, yh + (2 * b));
+ PrintArray(orig_y, y_st, yh + b * 2);
printf("rotate a\n");
- print_array(roa_y, y_st, yh + (2 * b));
+ PrintArray(roa_y, y_st, yh + b * 2);
printf("rotate b\n");
- print_array(rob_y, y_st, yh + (2 * b));
+ PrintArray(rob_y, y_st, yh + b * 2);
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)rob_u[i] != -(signed char)rob_v[i])
+ if ((signed char)rob_u[i] != -(signed char)rob_v[i]) {
++uv_err;
- if (rob_u[i] != 0)
+ }
+ if (rob_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++uv_err;
+ }
if (uv_err) {
- printf("input %dx%d \n", (2 * uvw), uvh);
- print_array(orig_uv, y_st, uvh + (2 * b));
+ printf("input %dx%d \n", uvw * 2, uvh);
+ PrintArray(orig_uv, y_st, uvh + b * 2);
printf("rotate a\n");
- print_array(roa_u, uv_st, uvh + (2 * b));
- print_array(roa_v, uv_st, uvh + (2 * b));
+ PrintArray(roa_u, uv_st, uvh + b * 2);
+ PrintArray(roa_v, uv_st, uvh + b * 2);
printf("rotate b\n");
- print_array(rob_u, uv_st, uvh + (2 * b));
- print_array(rob_v, uv_st, uvh + (2 * b));
+ PrintArray(rob_u, uv_st, uvh + b * 2);
+ PrintArray(rob_v, uv_st, uvh + b * 2);
}
- free(orig_y);
- free(orig_uv);
- free(roa_y);
- free(roa_u);
- free(roa_v);
- free(rob_y);
- free(rob_u);
- free(rob_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(roa_y)
+ free_aligned_buffer_16(roa_u)
+ free_aligned_buffer_16(roa_v)
+ free_aligned_buffer_16(rob_y)
+ free_aligned_buffer_16(rob_u)
+ free_aligned_buffer_16(rob_v)
EXPECT_EQ(0, y_err + uv_err);
}
+
+} // namespace libyuv
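
The NV12 rotate tests above hinge on one invariant: the interleaved UV source is filled with byte pairs (n, -n), so after the NV12-to-I420 split every U byte must be the two's-complement negation of the matching V byte, and a nonzero count guards against an all-zero output passing trivially. A minimal standalone sketch of that check follows; VerifyNegatedPlanes is a hypothetical helper, not part of this commit.

// Sketch of the U/-V consistency check used by the NV12 rotate tests above.
// VerifyNegatedPlanes is illustrative only.
#include <cstdint>
#include <cstdio>

static int VerifyNegatedPlanes(const uint8_t* u, const uint8_t* v, int size) {
  int err = 0;
  int zero_cnt = 0;
  for (int i = 0; i < size; ++i) {
    // Each U byte must equal the negation of the matching V byte.
    if (static_cast<signed char>(u[i]) != -static_cast<signed char>(v[i])) {
      ++err;
    }
    if (u[i] != 0) {
      ++zero_cnt;
    }
  }
  // All-zero planes would satisfy the negation check trivially, so require
  // at least one nonzero byte, as the tests above do.
  if (!zero_cnt) {
    ++err;
  }
  return err;
}

int main() {
  uint8_t u[4] = {1, 2, 3, 0};
  uint8_t v[4] = {0xFF, 0xFE, 0xFD, 0};  // -1, -2, -3, 0 as signed chars
  printf("errors: %d\n", VerifyNegatedPlanes(u, v, 4));  // prints 0
  return 0;
}
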
diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc
new file mode 100644
index 00000000..fef96764
--- /dev/null
+++ b/files/unit_test/scale_argb_test.cc
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static int ARGBTestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int benchmark_iterations) {
+ const int b = 128;
+ int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
+ int src_stride_argb = (b * 2 + src_width) * 4;
+
+ align_buffer_16(src_argb, src_argb_plane_size)
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ srandom(time(NULL));
+
+ int i, j;
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
+ }
+ }
+
+ align_buffer_16(dst_argb_c, dst_argb_plane_size)
+ align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ c_time = (get_time() - c_time) / benchmark_iterations;
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_16(dst_argb_c)
+ free_aligned_buffer_16(dst_argb_opt)
+ free_aligned_buffer_16(src_argb)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy2) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 2;
+ const int dst_height = src_height / 2;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy4) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 4;
+ const int dst_height = src_height / 4;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy5) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 5;
+ const int dst_height = src_height / 5;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy8) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 8;
+ const int dst_height = src_height / 8;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy16) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 16;
+ const int dst_height = src_height / 16;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy34) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width * 3 / 4;
+ const int dst_height = src_height * 3 / 4;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy38) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = src_width * 3 / 8;
+ int dst_height = src_height * 3 / 8;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleTo1366) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 1366;
+ int dst_height = 768;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleTo4074) {
+ int src_width = 2880 * 2;
+ int src_height = 1800;
+ int dst_width = 4074;
+ int dst_height = 1272;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+
+TEST_F(libyuvTest, ARGBScaleTo853) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+} // namespace libyuv
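
A note on the pointer arithmetic repeated throughout ARGBTestFilter above: with a border of b pixels and 4 bytes per ARGB pixel, the row stride is (b * 2 + width) * 4 bytes and the first interior pixel sits stride * b + b * 4 bytes into the buffer, which is exactly the src_argb + (src_stride_argb * b) + b * 4 expression passed to ARGBScale. A small illustrative sketch with the 1280-wide test geometry (not library code):

// Worked example of the ARGB border/stride offsets used in ARGBTestFilter.
#include <cstdio>

int main() {
  const int b = 128;        // border, in pixels, as in the tests above
  const int width = 1280;   // source width used by the ARGBScaleDownBy* tests
  const int stride = (b * 2 + width) * 4;   // bytes per row, 4 bytes per pixel
  const int interior = stride * b + b * 4;  // skip b border rows, then b pixels
  printf("stride = %d bytes, first interior pixel at byte %d\n",
         stride, interior);
  return 0;
}
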
diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc
index e147d78b..55b4148d 100644
--- a/files/unit_test/scale_test.cc
+++ b/files/unit_test/scale_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,152 +8,369 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/scale.h"
-#include "unit_test.h"
#include <stdlib.h>
#include <time.h>
-using namespace libyuv;
-
-#define align_buffer_16(var, size) \
- uint8 *var; \
- uint8 *var##_mem; \
- var##_mem = reinterpret_cast<uint8*>(calloc(size+15, sizeof(uint8))); \
- var = reinterpret_cast<uint8*> \
- ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f));
-
-#define free_aligned_buffer_16(var) \
- free(var##_mem); \
- var = 0;
-
-TEST_F(libyuvTest, ScaleDownBy4) {
- int b = 128;
- int src_width = 1280;
- int src_height = 720;
- int src_width_uv = (src_width + 1) >> 1;
- int src_height_uv = (src_height + 1) >> 1;
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "../unit_test/unit_test.h"
- int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
- int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
+namespace libyuv {
- int src_stride_y = 2 * b + src_width;
- int src_stride_uv = 2 * b + src_width_uv;
+static int TestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int rounding, int benchmark_iterations) {
+ const int b = 128 * rounding;
+ int src_width_uv = (src_width + rounding) >> 1;
+ int src_height_uv = (src_height + rounding) >> 1;
- align_buffer_16(src_y, src_y_plane_size)
- align_buffer_16(src_u, src_uv_plane_size)
- align_buffer_16(src_v, src_uv_plane_size)
+ int src_y_plane_size = (src_width + b * 2) * (src_height + b * 2);
+ int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
- int dst_width = src_width >> 2;
- int dst_height = src_height >> 2;
+ int src_stride_y = b * 2 + src_width;
+ int src_stride_uv = b * 2 + src_width_uv;
- int dst_width_uv = (dst_width + 1) >> 1;
- int dst_height_uv = (dst_height + 1) >> 1;
+ align_buffer_page_end(src_y, src_y_plane_size)
+ align_buffer_page_end(src_u, src_uv_plane_size)
+ align_buffer_page_end(src_v, src_uv_plane_size)
- int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
- int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
+ int dst_width_uv = (dst_width + rounding) >> 1;
+ int dst_height_uv = (dst_height + rounding) >> 1;
- int dst_stride_y = 2 * b + dst_width;
- int dst_stride_uv = 2 * b + dst_width_uv;
+ int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+ int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
- align_buffer_16(dst_y, dst_y_plane_size)
- align_buffer_16(dst_u, dst_uv_plane_size)
- align_buffer_16(dst_v, dst_uv_plane_size)
+ int dst_stride_y = b * 2 + dst_width;
+ int dst_stride_uv = b * 2 + dst_width_uv;
- // create an image with random data reoccurring in 4x4 grid. When the image
- // is filtered all the values should be the same.
srandom(time(NULL));
- uint8 block_data[16];
-
int i, j;
-
- // Pulling 16 random numbers there is an infinitesimally small
- // chance that they are all 0. Then the output will be all 0.
- // Output buffer is filled with 0, want to make sure that after the
- // filtering something went into the output buffer.
- // Avoid this by setting one of the values to 128. Also set the
- // random data to at least 1 for when point sampling to prevent
- // output all being 0.
- block_data[0] = 128;
-
- for (i = 1; i < 16; i++)
- block_data[i] = (random() & 0xfe) + 1;
-
- for (i = b; i < (src_height + b); i += 4) {
- for (j = b; j < (src_width + b); j += 4) {
- uint8 *ptr = src_y + (i * src_stride_y) + j;
- int k, l;
- for (k = 0; k < 4; ++k)
- for (l = 0; l < 4; ++l)
- ptr[k + src_stride_y * l] = block_data[k + 4 * l];
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b); ++j) {
+ src_y[(i * src_stride_y) + j] = (random() & 0xff);
}
}
- for (i = 1; i < 16; i++)
- block_data[i] = (random() & 0xfe) + 1;
-
- for (i = b; i < (src_height_uv + b); i += 4) {
- for (j = b; j < (src_width_uv + b); j += 4) {
- uint8 *ptru = src_u + (i * src_stride_uv) + j;
- uint8 *ptrv = src_v + (i * src_stride_uv) + j;
- int k, l;
- for (k = 0; k < 4; ++k)
- for (l = 0; l < 4; ++l) {
- ptru[k + src_stride_uv * l] = block_data[k + 4 * l];
- ptrv[k + src_stride_uv * l] = block_data[k + 4 * l];
- }
+ for (i = b; i < (src_height_uv + b); ++i) {
+ for (j = b; j < (src_width_uv + b); ++j) {
+ src_u[(i * src_stride_uv) + j] = (random() & 0xff);
+ src_v[(i * src_stride_uv) + j] = (random() & 0xff);
}
}
- int f;
- int err = 0;
+ align_buffer_page_end(dst_y_c, dst_y_plane_size)
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
- // currently three filter modes, defined as FilterMode in scale.h
- for (f = 0; f < 3; ++f) {
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
- dst_y + (dst_stride_y * b) + b, dst_stride_y,
- dst_u + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height,
- static_cast<FilterMode>(f));
-
- int value = dst_y[(dst_stride_y * b) + b];
-
- // catch the case that the output buffer is all 0
- if (value == 0)
- ++err;
-
- for (i = b; i < (dst_height + b); ++i) {
- for (j = b; j < (dst_width + b); ++j) {
- if (value != dst_y[(i * dst_stride_y) + j])
- ++err;
- }
- }
+ dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ }
+ c_time = (get_time() - c_time) / benchmark_iterations;
- value = dst_u[(dst_stride_uv * b) + b];
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
- if (value == 0)
- ++err;
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b; j < (dst_width + b); ++j) {
+ int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
- for (i = b; i < (dst_height_uv + b); ++i) {
- for (j = b; j < (dst_width_uv + b); ++j) {
- if (value != dst_u[(i * dst_stride_uv) + j])
- ++err;
- if (value != dst_v[(i * dst_stride_uv) + j])
- ++err;
+ for (i = b; i < (dst_height_uv + b); ++i) {
+ for (j = b; j < (dst_width_uv + b); ++j) {
+ int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
}
}
}
- free_aligned_buffer_16(src_y)
- free_aligned_buffer_16(src_u)
- free_aligned_buffer_16(src_v)
- free_aligned_buffer_16(dst_y)
- free_aligned_buffer_16(dst_u)
- free_aligned_buffer_16(dst_v)
+ free_aligned_buffer_page_end(dst_y_c)
+ free_aligned_buffer_page_end(dst_u_c)
+ free_aligned_buffer_page_end(dst_v_c)
+ free_aligned_buffer_page_end(dst_y_opt)
+ free_aligned_buffer_page_end(dst_u_opt)
+ free_aligned_buffer_page_end(dst_v_opt)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ScaleDownBy2) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 2;
+ const int dst_height = src_height / 2;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy4) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 4;
+ const int dst_height = src_height / 4;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 2); // This is the only scale factor with error of 2.
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy5) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 5;
+ const int dst_height = src_height / 5;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy8) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 8;
+ const int dst_height = src_height / 8;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy16) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 16;
+ const int dst_height = src_height / 16;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy34) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width * 3 / 4;
+ const int dst_height = src_height * 3 / 4;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy38) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = src_width * 3 / 8;
+ int dst_height = src_height * 3 / 8;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo1366) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 1366;
+ int dst_height = 768;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo4074) {
+ int src_width = 2880 * 2;
+ int src_height = 1800;
+ int dst_width = 4074;
+ int dst_height = 1272;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo853) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo853Wrong) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
- EXPECT_EQ(0, err);
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 0,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
}
+
+// A one-off test for a screencast resolution scale.
+TEST_F(libyuvTest, ScaleTo684) {
+ int src_width = 686;
+ int src_height = 557;
+ int dst_width = 684;
+ int dst_height = 552;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo342) {
+ int src_width = 686;
+ int src_height = 557;
+ int dst_width = 342;
+ int dst_height = 276;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleToHalf342) {
+ int src_width = 684;
+ int src_height = 552;
+ int dst_width = 342;
+ int dst_height = 276;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+} // namespace libyuv
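
The rounding argument to TestFilter is what separates ScaleTo853 from ScaleTo853Wrong above: it scales the border (128 * rounding) and decides whether odd luma dimensions produce a rounded-up ((w + 1) >> 1) or truncated (w >> 1) chroma plane. A short sketch of the resulting geometry for the 853x480 destination, purely illustrative:

// How TestFilter's 'rounding' argument changes the chroma geometry
// for the 853x480 destination used by ScaleTo853 / ScaleTo853Wrong.
#include <cstdio>

int main() {
  const int dst_width = 853;
  const int dst_height = 480;
  for (int rounding = 1; rounding >= 0; --rounding) {
    int b = 128 * rounding;                  // border collapses to 0
    int uvw = (dst_width + rounding) >> 1;   // 427 rounded up vs 426 truncated
    int uvh = (dst_height + rounding) >> 1;  // 240 either way (even height)
    printf("rounding=%d: border=%d, UV plane %dx%d\n", rounding, b, uvw, uvh);
  }
  return 0;
}
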
diff --git a/files/unit_test/testdata/arm_v7.txt b/files/unit_test/testdata/arm_v7.txt
new file mode 100644
index 00000000..5d7dbd04
--- /dev/null
+++ b/files/unit_test/testdata/arm_v7.txt
@@ -0,0 +1,12 @@
+Processor : ARMv7 Processor rev 5 (v7l)
+BogoMIPS : 795.44
+Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16
+CPU implementer : 0x56
+CPU architecture: 7
+CPU variant : 0x0
+CPU part : 0x581
+CPU revision : 5
+
+Hardware : OLPC XO-1.75
+Revision : 0000
+Serial : 0000000000000000
diff --git a/files/unit_test/testdata/tegra3.txt b/files/unit_test/testdata/tegra3.txt
new file mode 100644
index 00000000..d1b09f6b
--- /dev/null
+++ b/files/unit_test/testdata/tegra3.txt
@@ -0,0 +1,23 @@
+Processor : ARMv7 Processor rev 9 (v7l)
+processor : 0
+BogoMIPS : 1992.29
+
+processor : 1
+BogoMIPS : 1992.29
+
+processor : 2
+BogoMIPS : 1992.29
+
+processor : 3
+BogoMIPS : 1992.29
+
+Features : swp half thumb fastmult vfp edsp neon vfpv3
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant : 0x2
+CPU part : 0xc09
+CPU revision : 9
+
+Hardware : cardhu
+Revision : 0000
+
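
These two cpuinfo captures differ in the one flag relevant here: the Tegra 3 Features line lists neon while the XO-1.75 one does not, which is what a CPU-detection test would key on. An illustrative scan over such a capture follows; HasNeonFeature is a hypothetical helper, not the libyuv cpu_id API.

// Illustrative check: scan a captured /proc/cpuinfo "Features" line for NEON.
// HasNeonFeature is a hypothetical helper, not part of libyuv.
#include <cstdio>
#include <cstring>

static bool HasNeonFeature(const char* path) {
  FILE* f = fopen(path, "r");
  if (!f) {
    return false;
  }
  char line[512];
  bool neon = false;
  while (fgets(line, sizeof(line), f)) {
    if (strncmp(line, "Features", 8) == 0 && strstr(line, " neon")) {
      neon = true;
      break;
    }
  }
  fclose(f);
  return neon;
}

int main() {
  printf("tegra3: %d\n", HasNeonFeature("files/unit_test/testdata/tegra3.txt"));
  printf("arm_v7: %d\n", HasNeonFeature("files/unit_test/testdata/arm_v7.txt"));
  return 0;
}
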
diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc
index 1996adf1..007c81f0 100644
--- a/files/unit_test/unit_test.cc
+++ b/files/unit_test/unit_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,33 +8,26 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <cstring>
-#include "unit_test.h"
-
-class libyuvEnvironment : public ::testing::Environment {
- public:
- virtual void SetUp() {
- }
+#include "../unit_test/unit_test.h"
- virtual void TearDown() {
- }
-};
-
-libyuvTest::libyuvTest() :
- _rotate_max_w(128),
- _rotate_max_h(128) {
-}
+#include <stdlib.h> // For getenv()
-void libyuvTest::SetUp() {
-}
+#include <cstring>
-void libyuvTest::TearDown() {
+// Change this to 1000 for benchmarking.
+// TODO(fbarchard): Add command line parsing to pass this as an option.
+#define BENCHMARK_ITERATIONS 1
+
+libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
+ benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(1280),
+ benchmark_height_(720) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
- libyuvEnvironment* env = new libyuvEnvironment;
- ::testing::AddGlobalTestEnvironment(env);
-
return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
index cac30c72..62521e88 100644
--- a/files/unit_test/unit_test.h
+++ b/files/unit_test/unit_test.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,20 +8,67 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef UINIT_TEST_H_
-#define UINIT_TEST_H_
+#ifndef UNIT_TEST_UNIT_TEST_H_
+#define UNIT_TEST_UNIT_TEST_H_
#include <gtest/gtest.h>
+#define align_buffer_16(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc((size) + 15)); \
+ var = reinterpret_cast<uint8*> \
+ ((reinterpret_cast<intptr_t>(var##_mem) + 15) & ~15);
+
+#define free_aligned_buffer_16(var) \
+ free(var##_mem); \
+ var = 0;
+
+
+#define align_buffer_page_end(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
+ var = var##_mem + (-(size) & 4095);
+
+#define free_aligned_buffer_page_end(var) \
+ free(var##_mem); \
+ var = 0;
+
+#ifdef WIN32
+#include <windows.h>
+static inline double get_time() {
+ LARGE_INTEGER t, f;
+ QueryPerformanceCounter(&t);
+ QueryPerformanceFrequency(&f);
+ return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
+}
+
+#define random rand
+#define srandom srand
+#else
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+static inline double get_time() {
+ struct timeval t;
+ struct timezone tzp;
+ gettimeofday(&t, &tzp);
+ return t.tv_sec + t.tv_usec * 1e-6;
+}
+#endif
+
class libyuvTest : public ::testing::Test {
protected:
libyuvTest();
- virtual void SetUp();
- virtual void TearDown();
- const int _rotate_max_w;
- const int _rotate_max_h;
+ const int rotate_max_w_;
+ const int rotate_max_h_;
+ int benchmark_iterations_;
+ const int benchmark_width_;
+ const int benchmark_height_;
};
-#endif // UNIT_TEST_H_
+#endif // UNIT_TEST_UNIT_TEST_H_
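
The align_buffer_page_end macro above places the buffer so that its last byte coincides with the end of a page-rounded allocation, which makes reads past the end more likely to fault than with the 16-byte variant. A small sketch of that arithmetic, assuming the macro's 4096-byte page size:

// Sketch of the arithmetic behind align_buffer_page_end (from unit_test.h):
// the buffer is placed so that var + size lands exactly on the end of the
// page-rounded allocation, helping catch reads past the end.
#include <cassert>
#include <cstdio>

int main() {
  const int size = 100;                     // example buffer size
  const int alloc = (size + 4095) & ~4095;  // page-rounded allocation: 4096
  const int offset = (-size) & 4095;        // start offset inside it: 3996
  assert(offset + size == alloc);           // buffer ends on the allocation end
  printf("alloc=%d, offset=%d, end=%d\n", alloc, offset, offset + size);
  return 0;
}
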
diff --git a/files/unit_test/version_test.cc b/files/unit_test/version_test.cc
new file mode 100644
index 00000000..c53d754c
--- /dev/null
+++ b/files/unit_test/version_test.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests SVN version against include/libyuv/version.h
+// SVN version is bumped by documentation changes as well as code.
+// Although the versions should match, once checked in, a tolerance is allowed.
+TEST_F(libyuvTest, TestVersion) {
+ EXPECT_GE(LIBYUV_VERSION, 169); // 169 is first version to support version.
+ printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
+#ifdef LIBYUV_SVNREVISION
+ const char *ver = strchr(LIBYUV_SVNREVISION, ':');
+ if (ver) {
+ ++ver;
+ } else {
+ ver = LIBYUV_SVNREVISION;
+ }
+ int svn_revision = atoi(ver); // NOLINT
+ printf("LIBYUV_SVNREVISION %d\n", svn_revision);
+ EXPECT_NEAR(LIBYUV_VERSION, svn_revision, 3); // Allow version to be close.
+ if (LIBYUV_VERSION != svn_revision) {
+ printf("WARNING - Versions do not match.\n");
+ }
+#endif
+}
+
+} // namespace libyuv
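
For reference, the strchr/atoi sequence in TestVersion above handles an SVN keyword-expanded revision string; the constant below is a hypothetical example value, the real string comes from libyuv/version.h.

// Sketch of the LIBYUV_SVNREVISION parse used in TestVersion, run on a
// hypothetical keyword string rather than the real macro.
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const char* kSvnRevision = "$Revision: 397 $";  // hypothetical example value
  const char* ver = strchr(kSvnRevision, ':');
  if (ver) {
    ++ver;                // skip the ':' so atoi sees " 397 $"
  } else {
    ver = kSvnRevision;   // plain numeric form, e.g. "397"
  }
  int svn_revision = atoi(ver);
  printf("parsed revision %d\n", svn_revision);  // prints 397
  return 0;
}
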
diff --git a/files/util/compare.cc b/files/util/compare.cc
new file mode 100644
index 00000000..f030c799
--- /dev/null
+++ b/files/util/compare.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/version.h"
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+ printf("libyuv compare v%d\n", LIBYUV_VERSION);
+ printf("compare file1.yuv file2.yuv\n");
+ return -1;
+ }
+ char* name1 = argv[1];
+ char* name2 = (argc > 2) ? argv[2] : NULL;
+ FILE* fin1 = fopen(name1, "rb");
+ FILE* fin2 = name2 ? fopen(name2, "rb") : NULL;
+
+ const int kBlockSize = 32768;
+ uint8 buf1[kBlockSize];
+ uint8 buf2[kBlockSize];
+ uint32 hash1 = 5381;
+ uint32 hash2 = 5381;
+ uint64 sum_square_err = 0;
+ uint64 size_min = 0;
+ int amt1 = 0;
+ int amt2 = 0;
+ do {
+ amt1 = fread(buf1, 1, kBlockSize, fin1);
+ if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+ if (fin2) {
+ amt2 = fread(buf2, 1, kBlockSize, fin2);
+ if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+ int amt_min = (amt1 < amt2) ? amt1 : amt2;
+ size_min += amt_min;
+ sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
+ }
+ } while (amt1 > 0 || amt2 > 0);
+
+ printf("hash1 %x", hash1);
+ if (fin2) {
+ printf(", hash2 %x", hash2);
+ double mse = static_cast<double>(sum_square_err) /
+ static_cast<double>(size_min);
+ printf(", mse %.2f", mse);
+ double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
+ printf(", psnr %.2f\n", psnr);
+ fclose(fin2);
+ }
+ fclose(fin1);
+}
+
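
The two numbers this tool prints come from a DJB2-style rolling hash (seeded with 5381, as in the code above, with each byte folded in as hash * 33 + byte) and a PSNR derived from the accumulated squared error over 8-bit samples. A self-contained sketch of both formulas, written for illustration rather than copied from the library:

// Self-contained sketch of the two metrics the compare tool prints:
// a DJB2-style rolling hash and PSNR computed from summed squared error.
// This reimplements the formulas for illustration; it is not libyuv code.
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t HashDjb2Sketch(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // classic DJB2 step
  }
  return hash;
}

static double PsnrSketch(uint64_t sum_square_err, uint64_t count) {
  if (sum_square_err == 0) {
    return 128.0;  // identical buffers: report a capped "max" PSNR (assumption)
  }
  double mse = static_cast<double>(sum_square_err) / static_cast<double>(count);
  return 10.0 * log10(255.0 * 255.0 / mse);
}

int main() {
  const uint8_t a[4] = {10, 20, 30, 40};
  const uint8_t b[4] = {10, 22, 30, 44};
  uint64_t sse = 0;
  for (int i = 0; i < 4; ++i) {
    int d = a[i] - b[i];
    sse += static_cast<uint64_t>(d * d);
  }
  printf("hash %x, psnr %.2f\n", HashDjb2Sketch(a, 4, 5381), PsnrSketch(sse, 4));
  return 0;
}
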