Diffstat (limited to 'third_party/libyuv/source/row_common.cc')
-rw-r--r-- | third_party/libyuv/source/row_common.cc | 1230 |
1 file changed, 793 insertions, 437 deletions
diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc
index a941c3f5fc..c6e412414e 100644
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
@@ -21,10 +22,14 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// The following ifdef from row_win makes the C code match the row_win code,
-// which is 7 bit fixed point.
+// This macro controls YUV to RGB using unsigned math to extend range of
+// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
+// LIBYUV_UNLIMITED_DATA
+
+// The following macro from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point for ARGBToI420:
 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
-    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+    !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
 #define LIBYUV_RGB7 1
 #endif
 
@@ -50,6 +55,11 @@ static __inline int32_t clamp1023(int32_t v) {
   return (-(v >= 1023) | v) & 1023;
 }
 
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+  return (-(v >= max) | v) & max;
+}
+
 static __inline uint32_t Abs(int32_t v) {
   int m = -(v < 0);
   return (v + m) ^ m;
@@ -67,6 +77,10 @@ static __inline int32_t clamp1023(int32_t v) {
   return (v > 1023) ? 1023 : v;
 }
 
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+  return (v > max) ? max : v;
+}
+
 static __inline uint32_t Abs(int32_t v) {
   return (v < 0) ? -v : v;
 }
@@ -413,6 +427,82 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
   }
 }
 
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ar64[0] = src_argb[0] * 0x0101;
+    dst_ar64[1] = src_argb[1] * 0x0101;
+    dst_ar64[2] = src_argb[2] * 0x0101;
+    dst_ar64[3] = src_argb[3] * 0x0101;
+    dst_ar64 += 4;
+    src_argb += 4;
+  }
+}
+
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ab64[0] = src_argb[2] * 0x0101;
+    dst_ab64[1] = src_argb[1] * 0x0101;
+    dst_ab64[2] = src_argb[0] * 0x0101;
+    dst_ab64[3] = src_argb[3] * 0x0101;
+    dst_ab64 += 4;
+    src_argb += 4;
+  }
+}
+
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_argb[0] = src_ar64[0] >> 8;
+    dst_argb[1] = src_ar64[1] >> 8;
+    dst_argb[2] = src_ar64[2] >> 8;
+    dst_argb[3] = src_ar64[3] >> 8;
+    dst_argb += 4;
+    src_ar64 += 4;
+  }
+}
+
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_argb[0] = src_ab64[2] >> 8;
+    dst_argb[1] = src_ab64[1] >> 8;
+    dst_argb[2] = src_ab64[0] >> 8;
+    dst_argb[3] = src_ab64[3] >> 8;
+    dst_argb += 4;
+    src_ab64 += 4;
+  }
+}
+
+// TODO(fbarchard): Make shuffle compatible with SIMD versions
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+                      uint8_t* dst_ar64,
+                      const uint8_t* shuffler,
+                      int width) {
+  const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64;
+  uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64;
+  int index0 = shuffler[0] / 2;
+  int index1 = shuffler[2] / 2;
+  int index2 = shuffler[4] / 2;
+  int index3 = shuffler[6] / 2;
+  // Shuffle a row of AR64.
+  int x;
+  for (x = 0; x < width / 2; ++x) {
+    // To support in-place conversion.
+    uint16_t b = src_ar64_16[index0];
+    uint16_t g = src_ar64_16[index1];
+    uint16_t r = src_ar64_16[index2];
+    uint16_t a = src_ar64_16[index3];
+    dst_ar64_16[0] = b;
+    dst_ar64_16[1] = g;
+    dst_ar64_16[2] = r;
+    dst_ar64_16[3] = a;
+    src_ar64_16 += 4;
+    dst_ar64_16 += 4;
+  }
+}
+
 #ifdef LIBYUV_RGB7
 // Old 7 bit math for compatibility on unsupported platforms.
 static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
@@ -462,80 +552,80 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
 
 // Intel version mimic SSE/AVX which does 2 pavgb
 #if LIBYUV_ARGBTOUV_PAVGB
-#define MAKEROWY(NAME, R, G, B, BPP) \
-  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
-      src_argb0 += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
-                       uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
-                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
-      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
-                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
-      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
-                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
-      dst_u[0] = RGBToU(ar, ag, ab); \
-      dst_v[0] = RGBToV(ar, ag, ab); \
-      src_rgb0 += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
-      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
-      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
-      dst_u[0] = RGBToU(ar, ag, ab); \
-      dst_v[0] = RGBToV(ar, ag, ab); \
-    } \
+#define MAKEROWY(NAME, R, G, B, BPP) \
+  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+    int x; \
+    for (x = 0; x < width; ++x) { \
+      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+      src_rgb += BPP; \
+      dst_y += 1; \
+    } \
+  } \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) { \
+    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+    int x; \
+    for (x = 0; x < width - 1; x += 2) { \
+      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+      dst_u[0] = RGBToU(ar, ag, ab); \
+      dst_v[0] = RGBToV(ar, ag, ab); \
+      src_rgb += BPP * 2; \
+      src_rgb1 += BPP * 2; \
+      dst_u += 1; \
+      dst_v += 1; \
+    } \
+    if (width & 1) { \
+      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+      dst_u[0] = RGBToU(ar, ag, ab); \
+      dst_v[0] = RGBToV(ar, ag, ab); \
+    } \
   }
 #else
 // ARM version does sum / 2 then multiply by 2x smaller coefficients
-#define MAKEROWY(NAME, R, G, B, BPP) \
-  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
-      src_argb0 += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
-                       uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
-                     src_rgb1[B + BPP] + 1) >> \
-                    1; \
-      uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
-                     src_rgb1[G + BPP] + 1) >> \
-                    1; \
-      uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
-                     src_rgb1[R + BPP] + 1) >> \
-                    1; \
-      dst_u[0] = RGB2xToU(ar, ag, ab); \
-      dst_v[0] = RGB2xToV(ar, ag, ab); \
-      src_rgb0 += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
-      uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
-      uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
-      dst_u[0] = RGB2xToU(ar, ag, ab); \
-      dst_v[0] = RGB2xToV(ar, ag, ab); \
-    } \
+#define MAKEROWY(NAME, R, G, B, BPP) \
+  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+    int x; \
+    for (x = 0; x < width; ++x) { \
+      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+      src_rgb += BPP; \
+      dst_y += 1; \
+    } \
+  } \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) { \
+    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+    int x; \
+    for (x = 0; x < width - 1; x += 2) { \
+      uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+                     src_rgb1[B + BPP] + 1) >> \
+                    1; \
+      uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+                     src_rgb1[G + BPP] + 1) >> \
+                    1; \
+      uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+                     src_rgb1[R + BPP] + 1) >> \
+                    1; \
+      dst_u[0] = RGB2xToU(ar, ag, ab); \
+      dst_v[0] = RGB2xToV(ar, ag, ab); \
+      src_rgb += BPP * 2; \
+      src_rgb1 += BPP * 2; \
+      dst_u += 1; \
+      dst_v += 1; \
+    } \
+    if (width & 1) { \
+      uint16_t ab = src_rgb[B] + src_rgb1[B]; \
+      uint16_t ag = src_rgb[G] + src_rgb1[G]; \
+      uint16_t ar = src_rgb[R] + src_rgb1[R]; \
+      dst_u[0] = RGB2xToU(ar, ag, ab); \
+      dst_v[0] = RGB2xToV(ar, ag, ab); \
+    } \
   }
 #endif
 
@@ -603,80 +693,80 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
 // ARGBToYJ_C and ARGBToUVJ_C
 // Intel version mimic SSE/AVX which does 2 pavgb
 #if LIBYUV_ARGBTOUV_PAVGB
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
-      src_argb0 += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
-                        uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
-                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
-      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
-                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
-      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
-                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
-      dst_u[0] = RGBToUJ(ar, ag, ab); \
-      dst_v[0] = RGBToVJ(ar, ag, ab); \
-      src_rgb0 += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
-      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
-      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
-      dst_u[0] = RGBToUJ(ar, ag, ab); \
-      dst_v[0] = RGBToVJ(ar, ag, ab); \
-    } \
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+    int x; \
+    for (x = 0; x < width; ++x) { \
+      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+      src_rgb += BPP; \
+      dst_y += 1; \
+    } \
+  } \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) { \
+    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+    int x; \
+    for (x = 0; x < width - 1; x += 2) { \
+      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+      dst_u[0] = RGBToUJ(ar, ag, ab); \
+      dst_v[0] = RGBToVJ(ar, ag, ab); \
+      src_rgb += BPP * 2; \
+      src_rgb1 += BPP * 2; \
+      dst_u += 1; \
+      dst_v += 1; \
+    } \
+    if (width & 1) { \
+      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+      dst_u[0] = RGBToUJ(ar, ag, ab); \
+      dst_v[0] = RGBToVJ(ar, ag, ab); \
+    } \
   }
 #else
 // ARM version does sum / 2 then multiply by 2x smaller coefficients
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
-      src_argb0 += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
-                        uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
-                     src_rgb1[B + BPP] + 1) >> \
-                    1; \
-      uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
-                     src_rgb1[G + BPP] + 1) >> \
-                    1; \
-      uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
-                     src_rgb1[R + BPP] + 1) >> \
-                    1; \
-      dst_u[0] = RGB2xToUJ(ar, ag, ab); \
-      dst_v[0] = RGB2xToVJ(ar, ag, ab); \
-      src_rgb0 += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
-      uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
-      uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
-      dst_u[0] = RGB2xToUJ(ar, ag, ab); \
-      dst_v[0] = RGB2xToVJ(ar, ag, ab); \
-    } \
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+    int x; \
+    for (x = 0; x < width; ++x) { \
+      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+      src_rgb += BPP; \
+      dst_y += 1; \
+    } \
+  } \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) { \
+    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+    int x; \
+    for (x = 0; x < width - 1; x += 2) { \
+      uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+                     src_rgb1[B + BPP] + 1) >> \
+                    1; \
+      uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+                     src_rgb1[G + BPP] + 1) >> \
+                    1; \
+      uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+                     src_rgb1[R + BPP] + 1) >> \
+                    1; \
+      dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+      dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+      src_rgb += BPP * 2; \
+      src_rgb1 += BPP * 2; \
+      dst_u += 1; \
+      dst_v += 1; \
+    } \
+    if (width & 1) { \
+      uint16_t ab = (src_rgb[B] + src_rgb1[B]); \
+      uint16_t ag = (src_rgb[G] + src_rgb1[G]); \
+      uint16_t ar = (src_rgb[R] + src_rgb1[R]); \
+      dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+      dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+    } \
   }
 #endif
@@ -1146,16 +1236,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb,
 #define REPEAT8(v) (v) | ((v) << 8)
 #define SHADE(f, v) v* f >> 16
 
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32_t b = REPEAT8(src_argb0[0]);
-    const uint32_t g = REPEAT8(src_argb0[1]);
-    const uint32_t r = REPEAT8(src_argb0[2]);
-    const uint32_t a = REPEAT8(src_argb0[3]);
+    const uint32_t b = REPEAT8(src_argb[0]);
+    const uint32_t g = REPEAT8(src_argb[1]);
+    const uint32_t r = REPEAT8(src_argb[2]);
+    const uint32_t a = REPEAT8(src_argb[3]);
     const uint32_t b_scale = src_argb1[0];
     const uint32_t g_scale = src_argb1[1];
     const uint32_t r_scale = src_argb1[2];
@@ -1164,7 +1254,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
     dst_argb[3] = SHADE(a, a_scale);
-    src_argb0 += 4;
+    src_argb += 4;
     src_argb1 += 4;
     dst_argb += 4;
   }
@@ -1174,16 +1264,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
 
 #define SHADE(f, v) clamp255(v + f)
 
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
                   const uint8_t* src_argb1,
                   uint8_t* dst_argb,
                   int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const int b = src_argb0[0];
-    const int g = src_argb0[1];
-    const int r = src_argb0[2];
-    const int a = src_argb0[3];
+    const int b = src_argb[0];
+    const int g = src_argb[1];
+    const int r = src_argb[2];
+    const int a = src_argb[3];
     const int b_add = src_argb1[0];
     const int g_add = src_argb1[1];
     const int r_add = src_argb1[2];
@@ -1192,7 +1282,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
     dst_argb[1] = SHADE(g, g_add);
     dst_argb[2] = SHADE(r, r_add);
     dst_argb[3] = SHADE(a, a_add);
-    src_argb0 += 4;
+    src_argb += 4;
     src_argb1 += 4;
     dst_argb += 4;
   }
@@ -1201,16 +1291,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
 
 #define SHADE(f, v) clamp0(f - v)
 
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const int b = src_argb0[0];
-    const int g = src_argb0[1];
-    const int r = src_argb0[2];
-    const int a = src_argb0[3];
+    const int b = src_argb[0];
+    const int g = src_argb[1];
+    const int r = src_argb[2];
+    const int a = src_argb[3];
     const int b_sub = src_argb1[0];
     const int g_sub = src_argb1[1];
     const int r_sub = src_argb1[2];
@@ -1219,7 +1309,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0,
     dst_argb[1] = SHADE(g, g_sub);
     dst_argb[2] = SHADE(r, r_sub);
     dst_argb[3] = SHADE(a, a_sub);
-    src_argb0 += 4;
+    src_argb += 4;
     src_argb1 += 4;
     dst_argb += 4;
   }
@@ -1329,64 +1419,36 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 
 // Macros to create SIMD specific yuv to rgb conversion constants.
 
-#if defined(__aarch64__)
-#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
-  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
-      {UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \
-      {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \
-      {BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \
-  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
-      {VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \
-      {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \
-      {BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}};
-
-#elif defined(__arm__)
-#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
-  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
-      {UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
-      {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
-      {BB, BG, BR, YB, 0, 0, 0, 0}, \
-      {0x0101 * YG, YG, 0, 0}}; \
-  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
-      {VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
-      {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
-      {BR, BG, BB, YB, 0, 0, 0, 0}, \
-      {0x0101 * YG, YG, 0, 0}};
+// clang-format off
+#if defined(__aarch64__) || defined(__arm__)
+// Bias values include subtract 128 from U and V, bias from Y and rounding.
+// For B and R bias is negative. For G bias is positive.
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
+    0, 0}}
 #else
-#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
-  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
-      {-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \
-       -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
-      {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
-       UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
-      {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \
-       0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
-      {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
-      {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
-      {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
-      {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
-      {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \
-  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
-      {-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \
-       -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \
-      {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
-       VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
-      {0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \
-       0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \
-      {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
-      {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
-      {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
-      {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
-      {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}};
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
+   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
+   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
 #endif
 
-// TODO(fbarchard): Generate SIMD structures from float matrix.
+// clang-format on
 
-// Bias values to round, and subtract 128 from U and V.
-#define BB (-UB * 128 + YB)
-#define BG (UG * 128 + VG * 128 + YB)
-#define BR (-VR * 128 + YB)
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
+  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
+      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
+  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
+      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
+
+// TODO(fbarchard): Generate SIMD structures from float matrix.
 
 // BT.601 limited range YUV to RGB reference
 // R = (Y - 16) * 1.164 + V * 1.596
@@ -1395,7 +1457,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 // KR = 0.299; KB = 0.114
 
 // U and V contributions to R,G,B.
+#ifdef LIBYUV_UNLIMITED_DATA
+#define UB 129 /* round(2.018 * 64) */
+#else
 #define UB 128 /* max(128, round(2.018 * 64)) */
+#endif
 #define UG 25 /* round(0.391 * 64) */
 #define VG 52 /* round(0.813 * 64) */
 #define VR 102 /* round(1.596 * 64) */
@@ -1404,7 +1470,7 @@
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1429,7 +1495,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1444,9 +1510,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 // B = (Y - 16) * 1.164 + U * 2.112
 // KR = 0.2126, KB = 0.0722
 
-// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
 // U and V contributions to R,G,B.
+#ifdef LIBYUV_UNLIMITED_DATA
+#define UB 135 /* round(2.112 * 64) */
+#else
 #define UB 128 /* max(128, round(2.112 * 64)) */
+#endif
 #define UG 14 /* round(0.213 * 64) */
 #define VG 34 /* round(0.533 * 64) */
 #define VR 115 /* round(1.793 * 64) */
@@ -1455,7 +1524,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1480,7 +1549,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1495,9 +1564,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 // B = (Y - 16) * 1.164384 + U * 2.14177
 // KR = 0.2627; KB = 0.0593
 
-// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
 // U and V contributions to R,G,B.
+#ifdef LIBYUV_UNLIMITED_DATA
+#define UB 137 /* round(2.142 * 64) */
+#else
 #define UB 128 /* max(128, round(2.142 * 64)) */
+#endif
 #define UG 12 /* round(0.187326 * 64) */
 #define VG 42 /* round(0.65042 * 64) */
 #define VR 107 /* round(1.67867 * 64) */
@@ -1506,7 +1578,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1530,7 +1602,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1545,6 +1617,42 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 
 #undef MAKEYUVCONSTANTS
 
+#if defined(__aarch64__) || defined(__arm__)
+#define LOAD_YUV_CONSTANTS \
+  int ub = yuvconstants->kUVCoeff[0]; \
+  int vr = yuvconstants->kUVCoeff[1]; \
+  int ug = yuvconstants->kUVCoeff[2]; \
+  int vg = yuvconstants->kUVCoeff[3]; \
+  int yg = yuvconstants->kRGBCoeffBias[0]; \
+  int bb = yuvconstants->kRGBCoeffBias[1]; \
+  int bg = yuvconstants->kRGBCoeffBias[2]; \
+  int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 \
+  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+  int b16 = y1 + (u * ub) - bb; \
+  int g16 = y1 + bg - (u * ug + v * vg); \
+  int r16 = y1 + (v * vr) - br
+#else
+#define LOAD_YUV_CONSTANTS \
+  int ub = yuvconstants->kUVToB[0]; \
+  int ug = yuvconstants->kUVToG[0]; \
+  int vg = yuvconstants->kUVToG[1]; \
+  int vr = yuvconstants->kUVToR[1]; \
+  int yg = yuvconstants->kYToRgb[0]; \
+  int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 \
+  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+  int8_t ui = u; \
+  int8_t vi = v; \
+  ui -= 0x80; \
+  vi -= 0x80; \
+  int b16 = y1 + (ui * ub); \
+  int g16 = y1 - (ui * ug + vi * vg); \
+  int r16 = y1 + (vi * vr)
+#endif
+
 // C reference code that mimics the YUV assembly.
 // Reads 8 bit YUV and leaves result as 16 bit.
 static __inline void YuvPixel(uint8_t y,
@@ -1554,39 +1662,12 @@ static __inline void YuvPixel(uint8_t y,
                               uint8_t* g,
                               uint8_t* r,
                               const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = -yuvconstants->kUVToRB[1];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#elif defined(__arm__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[4];
-  int vr = -yuvconstants->kUVToRB[4];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#else
-  int ub = yuvconstants->kUVToB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = yuvconstants->kUVToR[1];
-  int bb = yuvconstants->kUVBiasB[0];
-  int bg = yuvconstants->kUVBiasG[0];
-  int br = yuvconstants->kUVBiasR[0];
-  int yg = yuvconstants->kYToRgb[0];
-#endif
-
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
-  *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
-  *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
 }
 
 // Reads 8 bit YUV and leaves result as 16 bit.
@@ -1597,85 +1678,50 @@ static __inline void YuvPixel8_16(uint8_t y,
                                   int* g,
                                   int* r,
                                   const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = -yuvconstants->kUVToRB[1];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#elif defined(__arm__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[4];
-  int vr = -yuvconstants->kUVToRB[4];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#else
-  int ub = yuvconstants->kUVToB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = yuvconstants->kUVToR[1];
-  int bb = yuvconstants->kUVBiasB[0];
-  int bg = yuvconstants->kUVBiasG[0];
-  int br = yuvconstants->kUVBiasR[0];
-  int yg = yuvconstants->kYToRgb[0];
-#endif
-
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
 // Reads 10 bit YUV and leaves result as 16 bit.
-static __inline void YuvPixel16(int16_t y,
-                                int16_t u,
-                                int16_t v,
-                                int* b,
-                                int* g,
-                                int* r,
-                                const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = -yuvconstants->kUVToRB[1];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#elif defined(__arm__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[4];
-  int vr = -yuvconstants->kUVToRB[4];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[1];
-#else
-  int ub = yuvconstants->kUVToB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = yuvconstants->kUVToR[1];
-  int bb = yuvconstants->kUVBiasB[0];
-  int bg = yuvconstants->kUVBiasG[0];
-  int br = yuvconstants->kUVBiasR[0];
-  int yg = yuvconstants->kYToRgb[0];
-#endif
-
-  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+static __inline void YuvPixel10_16(uint16_t y,
+                                   uint16_t u,
+                                   uint16_t v,
+                                   int* b,
+                                   int* g,
+                                   int* r,
+                                   const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y << 6;
   u = clamp255(u >> 2);
   v = clamp255(v >> 2);
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 12 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel12_16(int16_t y,
+                                   int16_t u,
+                                   int16_t v,
+                                   int* b,
+                                   int* g,
+                                   int* r,
+                                   const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y << 4;
+  u = clamp255(u >> 4);
+  v = clamp255(v >> 4);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 10 bit assembly.
@@ -1690,22 +1736,78 @@ static __inline void YuvPixel10(uint16_t y,
   int b16;
   int g16;
   int r16;
-  YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  *b = Clamp(b16 >> 6);
+  *g = Clamp(g16 >> 6);
+  *r = Clamp(r16 >> 6);
+}
+
+// C reference code that mimics the YUV 12 bit assembly.
+// Reads 12 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel12(uint16_t y,
+                                uint16_t u,
+                                uint16_t v,
+                                uint8_t* b,
+                                uint8_t* g,
+                                uint8_t* r,
+                                const struct YuvConstants* yuvconstants) {
+  int b16;
+  int g16;
+  int r16;
+  YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants);
   *b = Clamp(b16 >> 6);
   *g = Clamp(g16 >> 6);
   *r = Clamp(r16 >> 6);
 }
 
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 8 bit.
+static __inline void YuvPixel16_8(uint16_t y,
+                                  uint16_t u,
+                                  uint16_t v,
+                                  uint8_t* b,
+                                  uint8_t* g,
+                                  uint8_t* r,
+                                  const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y;
+  u = clamp255(u >> 8);
+  v = clamp255(v >> 8);
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16_16(uint16_t y,
+                                   uint16_t u,
+                                   uint16_t v,
+                                   int* b,
+                                   int* g,
+                                   int* r,
+                                   const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+  uint32_t y32 = y;
+  u = clamp255(u >> 8);
+  v = clamp255(v >> 8);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
+}
+
 // C reference code that mimics the YUV assembly.
-// Reads 8 bit YUV and leaves result as 16 bit.
+// Reads 8 bit YUV and leaves result as 8 bit.
 static __inline void YPixel(uint8_t y,
                             uint8_t* b,
                             uint8_t* g,
                             uint8_t* r,
                             const struct YuvConstants* yuvconstants) {
 #if defined(__aarch64__) || defined(__arm__)
-  int ygb = yuvconstants->kUVBiasBGR[3];
-  int yg = yuvconstants->kYToRgb[1];
+  int yg = yuvconstants->kRGBCoeffBias[0];
+  int ygb = yuvconstants->kRGBCoeffBias[4];
 #else
   int ygb = yuvconstants->kYBiasToRgb[0];
   int yg = yuvconstants->kYToRgb[0];
@@ -1716,38 +1818,6 @@ static __inline void YPixel(uint8_t y,
   *r = Clamp(((int32_t)(y1) + ygb) >> 6);
 }
 
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8_t* src_y,
-                     const uint8_t* src_u,
-                     const uint8_t* src_v,
-                     uint8_t* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
-             yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
-             yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_u += 2;
-    src_v += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
-             rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-#else
 void I444ToARGBRow_C(const uint8_t* src_y,
                      const uint8_t* src_u,
                      const uint8_t* src_v,
@@ -1765,7 +1835,6 @@ void I444ToARGBRow_C(const uint8_t* src_y,
     rgb_buf += 4;  // Advance 1 pixel.
   }
 }
-#endif
 
 // Also used for 420
 void I422ToARGBRow_C(const uint8_t* src_y,
@@ -1821,9 +1890,102 @@ void I210ToARGBRow_C(const uint16_t* src_y,
   }
 }
 
+void I410ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 4;  // Advance 1 pixels.
+  }
+}
+
+void I210AlphaToARGBRow_C(const uint16_t* src_y,
+                          const uint16_t* src_u,
+                          const uint16_t* src_v,
+                          const uint16_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = clamp255(src_a[0] >> 2);
+    YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = clamp255(src_a[1] >> 2);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    src_a += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = clamp255(src_a[0] >> 2);
+  }
+}
+
+void I410AlphaToARGBRow_C(const uint16_t* src_y,
+                          const uint16_t* src_u,
+                          const uint16_t* src_v,
+                          const uint16_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = clamp255(src_a[0] >> 2);
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    src_a += 1;
+    rgb_buf += 4;  // Advance 1 pixels.
+  }
+}
+
+// 12 bit YUV to ARGB
+void I212ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
 static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
   uint32_t ar30;
-  b = b >> 4;  // convert 10.6 to 10 bit.
+  b = b >> 4;  // convert 8 bit 10.6 to 10 bit.
   g = g >> 4;
   r = r >> 4;
   b = Clamp10(b);
@@ -1845,9 +2007,9 @@ void I210ToAR30Row_C(const uint16_t* src_y,
   int g;
   int r;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf, b, g, r);
-    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf + 4, b, g, r);
    src_y += 2;
     src_u += 1;
@@ -1855,16 +2017,15 @@ void I210ToAR30Row_C(const uint16_t* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf, b, g, r);
   }
 }
 
-// 8 bit YUV to 10 bit AR30
-// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
-void I422ToAR30Row_C(const uint8_t* src_y,
-                     const uint8_t* src_u,
-                     const uint8_t* src_v,
+// 12 bit YUV to 10 bit AR30
+void I212ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
@@ -1873,9 +2034,9 @@ void I422ToAR30Row_C(const uint8_t* src_y,
   int g;
   int r;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf, b, g, r);
-    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf + 4, b, g, r);
     src_y += 2;
     src_u += 1;
@@ -1883,45 +2044,142 @@ void I422ToAR30Row_C(const uint8_t* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
     StoreAR30(rgb_buf, b, g, r);
   }
 }
 
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
-void I444AlphaToARGBRow_C(const uint8_t* src_y,
-                          const uint8_t* src_u,
-                          const uint8_t* src_v,
-                          const uint8_t* src_a,
-                          uint8_t* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+void I410ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width; ++x) {
+    YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+
+// P210 has 10 bits in msb of 16 bit NV12 style layout.
+void P210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_uv,
+                     uint8_t* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
-             yuvconstants);
-    rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
-             yuvconstants);
-    rgb_buf[7] = src_a[1];
+    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+                 dst_argb + 2, yuvconstants);
+    dst_argb[3] = 255;
+    YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5,
+                 dst_argb + 6, yuvconstants);
+    dst_argb[7] = 255;
     src_y += 2;
-    src_u += 2;
-    src_v += 2;
-    src_a += 2;
+    src_uv += 2;
+    dst_argb += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+                 dst_argb + 2, yuvconstants);
+    dst_argb[3] = 255;
+  }
+}
+
+void P410ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_uv,
+                     uint8_t* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+                 dst_argb + 2, yuvconstants);
+    dst_argb[3] = 255;
+    src_y += 1;
+    src_uv += 2;
+    dst_argb += 4;  // Advance 1 pixels.
+  }
+}
+
+void P210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_uv,
+                     uint8_t* dst_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+    StoreAR30(dst_ar30, b, g, r);
+    YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+    StoreAR30(dst_ar30 + 4, b, g, r);
+    src_y += 2;
+    src_uv += 2;
+    dst_ar30 += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+    StoreAR30(dst_ar30, b, g, r);
+  }
+}
+
+void P410ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_uv,
+                     uint8_t* dst_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width; ++x) {
+    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+    StoreAR30(dst_ar30, b, g, r);
+    src_y += 1;
+    src_uv += 2;
+    dst_ar30 += 4;  // Advance 1 pixel.
+  }
+}
+
+// 8 bit YUV to 10 bit AR30
+// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
-             rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = src_a[0];
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
   }
 }
-#else
+
 void I444AlphaToARGBRow_C(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
@@ -1941,7 +2199,6 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
     rgb_buf += 4;  // Advance 1 pixel.
   }
 }
-#endif
 
 void I422AlphaToARGBRow_C(const uint8_t* src_y,
                           const uint8_t* src_u,
@@ -2492,6 +2749,105 @@ void MergeARGBRow_C(const uint8_t* src_r,
   }
 }
 
+void MergeXR30Row_C(const uint16_t* src_r,
+                    const uint16_t* src_g,
+                    const uint16_t* src_b,
+                    uint8_t* dst_ar30,
+                    int depth,
+                    int width) {
+  assert(depth >= 10);
+  assert(depth <= 16);
+  int x;
+  int shift = depth - 10;
+  uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30;
+  for (x = 0; x < width; ++x) {
+    uint32_t r = clamp1023(src_r[x] >> shift);
+    uint32_t g = clamp1023(src_g[x] >> shift);
+    uint32_t b = clamp1023(src_b[x] >> shift);
+    dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000;
+  }
+}
+
+void MergeAR64Row_C(const uint16_t* src_r,
+                    const uint16_t* src_g,
+                    const uint16_t* src_b,
+                    const uint16_t* src_a,
+                    uint16_t* dst_ar64,
+                    int depth,
+                    int width) {
+  assert(depth >= 1);
+  assert(depth <= 16);
+  int x;
+  int shift = 16 - depth;
+  int max = (1 << depth) - 1;
+  for (x = 0; x < width; ++x) {
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+    dst_ar64[3] = ClampMax(src_a[x], max) << shift;
+    dst_ar64 += 4;
+  }
+}
+
+void MergeARGB16To8Row_C(const uint16_t* src_r,
+                         const uint16_t* src_g,
+                         const uint16_t* src_b,
+                         const uint16_t* src_a,
+                         uint8_t* dst_argb,
+                         int depth,
+                         int width) {
+  assert(depth >= 8);
+  assert(depth <= 16);
+  int x;
+  int shift = depth - 8;
+  for (x = 0; x < width; ++x) {
+    dst_argb[0] = clamp255(src_b[x] >> shift);
+    dst_argb[1] = clamp255(src_g[x] >> shift);
+    dst_argb[2] = clamp255(src_r[x] >> shift);
+    dst_argb[3] = clamp255(src_a[x] >> shift);
+    dst_argb += 4;
+  }
+}
+
+void MergeXR64Row_C(const uint16_t* src_r,
+                    const uint16_t* src_g,
+                    const uint16_t* src_b,
+                    uint16_t* dst_ar64,
+                    int depth,
+                    int width) {
+  assert(depth >= 1);
+  assert(depth <= 16);
+  int x;
+  int shift = 16 - depth;
+  int max = (1 << depth) - 1;
+  for (x = 0; x < width; ++x) {
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+    dst_ar64[3] = 0xffff;
+    dst_ar64 += 4;
+  }
+}
+
+void MergeXRGB16To8Row_C(const uint16_t* src_r,
+                         const uint16_t* src_g,
+                         const uint16_t* src_b,
+                         uint8_t* dst_argb,
+                         int depth,
+                         int width) {
+  assert(depth >= 8);
+  assert(depth <= 16);
+  int x;
+  int shift = depth - 8;
+  for (x = 0; x < width; ++x) {
+    dst_argb[0] = clamp255(src_b[x] >> shift);
+    dst_argb[1] = clamp255(src_g[x] >> shift);
+    dst_argb[2] = clamp255(src_r[x] >> shift);
+    dst_argb[3] = 0xff;
+    dst_argb += 4;
+  }
+}
+
 void SplitXRGBRow_C(const uint8_t* src_argb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
@@ -2528,6 +2884,8 @@ void MergeUVRow_16_C(const uint16_t* src_u,
                      int depth,
                      int width) {
   int shift = 16 - depth;
+  assert(depth >= 8);
+  assert(depth <= 16);
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[0] = src_u[x] << shift;
@@ -2544,6 +2902,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv,
                      int width) {
   int shift = 16 - depth;
   int x;
+  assert(depth >= 8);
+  assert(depth <= 16);
   for (x = 0; x < width; ++x) {
     dst_u[x] = src_uv[0] >> shift;
     dst_v[x] = src_uv[1] >> shift;
@@ -2581,6 +2941,9 @@ void Convert16To8Row_C(const uint16_t* src_y,
                        int scale,
                        int width) {
   int x;
+  assert(scale >= 256);
+  assert(scale <= 32768);
+
   for (x = 0; x < width; ++x) {
     dst_y[x] = clamp255((src_y[x] * scale) >> 16);
   }
@@ -2714,19 +3077,19 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
 
 #define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
 
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
 // This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_C(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint32_t fb = src_argb0[0];
-    uint32_t fg = src_argb0[1];
-    uint32_t fr = src_argb0[2];
-    uint32_t a = src_argb0[3];
+    uint32_t fb = src_argb[0];
+    uint32_t fg = src_argb[1];
+    uint32_t fr = src_argb[2];
+    uint32_t a = src_argb[3];
     uint32_t bb = src_argb1[0];
     uint32_t bg = src_argb1[1];
     uint32_t br = src_argb1[2];
@@ -2735,10 +3098,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
     dst_argb[2] = BLEND(fr, br, a);
     dst_argb[3] = 255u;
 
-    fb = src_argb0[4 + 0];
-    fg = src_argb0[4 + 1];
-    fr = src_argb0[4 + 2];
-    a = src_argb0[4 + 3];
+    fb = src_argb[4 + 0];
+    fg = src_argb[4 + 1];
+    fr = src_argb[4 + 2];
+    a = src_argb[4 + 3];
     bb = src_argb1[4 + 0];
     bg = src_argb1[4 + 1];
     br = src_argb1[4 + 2];
@@ -2746,16 +3109,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
     dst_argb[4 + 1] = BLEND(fg, bg, a);
     dst_argb[4 + 2] = BLEND(fr, br, a);
     dst_argb[4 + 3] = 255u;
-    src_argb0 += 8;
+    src_argb += 8;
     src_argb1 += 8;
     dst_argb += 8;
   }
 
   if (width & 1) {
-    uint32_t fb = src_argb0[0];
-    uint32_t fg = src_argb0[1];
-    uint32_t fr = src_argb0[2];
-    uint32_t a = src_argb0[3];
+    uint32_t fb = src_argb[0];
+    uint32_t fg = src_argb[1];
+    uint32_t fr = src_argb[2];
+    uint32_t a = src_argb[3];
     uint32_t bb = src_argb1[0];
     uint32_t bg = src_argb1[1];
     uint32_t br = src_argb1[2];
@@ -3280,7 +3643,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
 // Maximum temporary width for wrappers to process at a time, in pixels.
 #define MAXTWIDTH 2048
 
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
     defined(HAS_I422TORGB565ROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.
 void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
@@ -3747,13 +4110,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
 }
 
 // Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
 void AYUVToUVRow_C(const uint8_t* src_ayuv,
                    int src_stride_ayuv,
                    uint8_t* dst_uv,
                    int width) {
   // Output a row of UV values, filtering 2x2 rows of AYUV.
   int x;
-  for (x = 0; x < width; x += 2) {
+  for (x = 0; x < width - 1; x += 2) {
     dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
                  src_ayuv[src_stride_ayuv + 5] + 2) >>
                 2;
@@ -3764,12 +4128,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv,
     dst_uv += 2;
   }
   if (width & 1) {
-    dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
-                 src_ayuv[src_stride_ayuv + 0] + 2) >>
-                2;
-    dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
-                 src_ayuv[src_stride_ayuv + 1] + 2) >>
-                2;
+    dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
+    dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
   }
 }
 
@@ -3780,7 +4140,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
                    int width) {
   // Output a row of VU values, filtering 2x2 rows of AYUV.
   int x;
-  for (x = 0; x < width; x += 2) {
+  for (x = 0; x < width - 1; x += 2) {
     dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
                  src_ayuv[src_stride_ayuv + 4] + 2) >>
                 2;
@@ -3791,12 +4151,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
     dst_vu += 2;
   }
   if (width & 1) {
-    dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
-                 src_ayuv[src_stride_ayuv + 0] + 2) >>
-                2;
-    dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
-                 src_ayuv[src_stride_ayuv + 1] + 2) >>
-                2;
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
   }
 }
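The new ARGBToAR64Row_C/ARGBToAB64Row_C rows above widen each 8 bit channel to 16 bits by multiplying by 0x0101, which replicates the byte into both halves of the result so that 0x00 maps to 0x0000 and 0xFF maps to 0xFFFF, preserving full range. The sketch below is a minimal standalone illustration of that identity; Widen8To16 and the main() driver are illustrative names, not libyuv APIs.

#include <assert.h>
#include <stdint.h>

// Widen one 8 bit channel to 16 bits: v * 0x0101 == (v << 8) | v.
static uint16_t Widen8To16(uint8_t v) {
  return (uint16_t)(v * 0x0101);
}

int main(void) {
  assert(Widen8To16(0x00) == 0x0000);  // black stays black
  assert(Widen8To16(0xff) == 0xffff);  // white maps to full 16 bit white
  assert(Widen8To16(0x12) == 0x1212);  // byte replicated into both halves
  // Narrowing back (as in AR64ToARGBRow_C) is a plain >> 8 and recovers
  // the original byte exactly.
  assert((Widen8To16(0x12) >> 8) == 0x12);
  return 0;
}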
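The new LOAD_YUV_CONSTANTS/CALC_RGB16 macros centralize the 10.6 fixed point YUV-to-RGB math that the removed per-function #if blocks used to duplicate: Y is replicated to 16 bits, scaled by YG and shifted down, the centered U/V contributions are added, and 6 fractional bits are kept until the final Clamp(x >> 6). Below is a worked sketch of the non-ARM path using the BT.601 limited range constants defined in this diff (UB 128, UG 25, VG 52, VR 102, YG 18997, YB -1160), assuming LIBYUV_UNLIMITED_DATA is not defined; YuvToRgbRef and Clamp8 are hypothetical helper names, not libyuv functions.

#include <assert.h>
#include <stdint.h>

// BT.601 limited-range constants from MAKEYUVCONSTANTS(I601, ...) above.
enum { UB = 128, UG = 25, VG = 52, VR = 102, YG = 18997, YB = -1160 };

static uint8_t Clamp8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Mirrors the non-ARM LOAD_YUV_CONSTANTS/CALC_RGB16 path in 10.6 fixed point.
static void YuvToRgbRef(uint8_t y, uint8_t u, uint8_t v,
                        uint8_t* b, uint8_t* g, uint8_t* r) {
  uint32_t y32 = y * 0x0101;                      // replicate Y to 16 bits
  int32_t y1 = (int32_t)((y32 * YG) >> 16) + YB;  // Y contribution plus bias
  int ui = (int)u - 128;                          // center U and V
  int vi = (int)v - 128;
  *b = Clamp8((y1 + ui * UB) >> 6);               // drop 6 fraction bits
  *g = Clamp8((y1 - (ui * UG + vi * VG)) >> 6);
  *r = Clamp8((y1 + vi * VR) >> 6);
}

int main(void) {
  uint8_t b, g, r;
  YuvToRgbRef(235, 128, 128, &b, &g, &r);  // limited-range white
  assert(b == 255 && g == 255 && r == 255);
  YuvToRgbRef(16, 128, 128, &b, &g, &r);   // limited-range black
  assert(b == 0 && g == 0 && r == 0);
  return 0;
}

With these constants, Y=235 yields y1 = 16346, and 16346 >> 6 = 255, so limited-range white saturates exactly; the LIBYUV_UNLIMITED_DATA variants only change UB (129/135/137) to recover the B coefficient that the signed path clips at 128.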
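The two MAKEROWY/MAKEROWYJ variants in this diff subsample a 2x2 block differently: the Intel path mimics two rounds of pavgb (pairwise rounding averages, staying 8 bit), while the ARM path computes a rounded sum / 2, which is roughly twice the average and is therefore paired with 2x smaller RGB2xTo* coefficients. A small sketch of the arithmetic difference, with made-up pixel values; the AVGB macro matches the one used in this file.

#include <stdint.h>
#include <stdio.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1)  // same rounding average as pavgb

int main(void) {
  uint8_t p00 = 10, p01 = 11, p10 = 12, p11 = 14;
  // Intel path: two pavgb rounds; result stays 8 bit but can round up twice.
  uint8_t intel = AVGB(AVGB(p00, p10), AVGB(p01, p11));
  // ARM path: one rounded sum / 2, kept at 9 bits for the halved coefficients.
  uint16_t arm2x = (p00 + p01 + p10 + p11 + 1) >> 1;
  printf("intel avg = %d, arm sum/2 = %d (about 2x the average)\n",
         intel, arm2x);
  return 0;
}

Here the true average is 11.75: the Intel path produces 12, and the ARM path produces 24, which the halved coefficients scale back to the same magnitude. The two paths can differ by a least significant bit because of where rounding happens, which is why the C code keeps both variants rather than one reference.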