about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.chromium                       2
-rw-r--r-- include/libyuv/planar_functions.h     9
-rw-r--r-- include/libyuv/rotate.h               8
-rw-r--r-- include/libyuv/version.h              2
-rw-r--r-- source/compare_neon64.cc              4
-rw-r--r-- source/planar_functions.cc          126
-rw-r--r-- source/rotate_argb.cc                77
-rw-r--r-- source/rotate_neon64.cc              19
-rw-r--r-- source/row_gcc.cc                   123
-rw-r--r-- source/row_mmi.cc                  2971
-rw-r--r-- source/row_neon64.cc                102
-rw-r--r-- source/scale_neon64.cc               61
-rw-r--r-- unit_test/planar_test.cc             69
-rw-r--r-- unit_test/rotate_argb_test.cc        42
14 files changed, 1969 insertions, 1646 deletions
diff --git a/README.chromium b/README.chromium
index 9c78a007..4a6830aa 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1744
+Version: 1746
License: BSD
License File: LICENSE
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 57395262..2aa95335 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -313,6 +313,15 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
+// Mirror a plane of data.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h
index c64e0216..30888224 100644
--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2b52c724..9d487f0c 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1744
+#define LIBYUV_VERSION 1746
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 6e8f672a..a22ba75b 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -33,8 +33,10 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
@@ -65,8 +67,10 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 7e7e6e35..b6aac913 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -716,70 +716,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
- }
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -1047,6 +983,68 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
+// Mirror a plane of data.
+// See Also I400Mirror
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
+ int dst_stride_y, int width, int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index a93fd55f..12a240f3 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index f469baac..99f7ee16 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -37,7 +37,7 @@ void TransposeWx8_NEON(const uint8_t* src,
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@@ -48,23 +48,39 @@ void TransposeWx8_NEON(const uint8_t* src,
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
+ "mov %0, %1 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
"trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
"trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
"trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
"trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
"trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
"trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
@@ -226,6 +242,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index fa7b8cb3..c041ba11 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -1101,8 +1101,11 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
- "paddw %%" #round ",%%xmm6 \n" \
- "paddw %%" #round ",%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round \
+ ",%%xmm6 \n" \
+ "paddw %%" #round \
+ ",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@@ -1111,33 +1114,36 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"sub $0x10,%2 \n" \
"jg 1b \n"
-#define RGBTOY_AVX2(round) \
- "1: \n" \
- "vmovdqu (%0),%%ymm0 \n" \
- "vmovdqu 0x20(%0),%%ymm1 \n" \
- "vmovdqu 0x40(%0),%%ymm2 \n" \
- "vmovdqu 0x60(%0),%%ymm3 \n" \
- "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
- "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
- "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
- "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
- "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
- "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
- "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
- "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
- "lea 0x80(%0),%0 \n" \
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
- "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
- "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
- "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
- "vmovdqu %%ymm0,(%1) \n" \
- "lea 0x20(%1),%1 \n" \
- "sub $0x20,%2 \n" \
- "jg 1b \n" \
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round \
+ ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round \
+ ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
"vzeroupper \n"
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -1148,15 +1154,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
@@ -1168,8 +1174,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- LABELALIGN
- RGBTOY(xmm5)
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1187,8 +1192,7 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- LABELALIGN
- RGBTOY(xmm5)
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1210,8 +1214,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm7)
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1219,7 +1222,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
@@ -1232,8 +1236,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm7)
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1241,7 +1244,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ABGRTOYROW_AVX2
@@ -1253,15 +1257,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm5)
+ LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
@@ -1273,9 +1277,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm5)
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1536,7 +1539,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kSub128), // %5
+ "m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@@ -1606,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
- "m"(kSub128) // %7
+ "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
@@ -1675,15 +1678,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@@ -1755,15 +1758,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1772,15 +1775,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
diff --git a/source/row_mmi.cc b/source/row_mmi.cc
index 50cfca72..d7d34e47 100644
--- a/source/row_mmi.cc
+++ b/source/row_mmi.cc
@@ -6040,90 +6040,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- "punpcklbh %[u], %[u], %[zero] \n\t" // u
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[zero] \n\t" // v
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ "punpcklbh %[u], %[u], %[zero] \n\t"//u
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[zero] \n\t"//v
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
// Also used for 420
@@ -6133,96 +6136,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t" // v
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"//v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
// 10 bit YUV to ARGB
@@ -6232,96 +6238,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "psllh %[y], %[y], %[six] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklhw %[u], %[u], %[u] \n\t"
- "psrah %[u], %[u], %[two] \n\t"
- "punpcklhw %[v], %[v], %[v] \n\t"
- "psrah %[v], %[v], %[two] \n\t"
- "pminsh %[u], %[u], %[mask1] \n\t"
- "pminsh %[v], %[v], %[mask1] \n\t"
-
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
-
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
-
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02),
- [mask1] "f"(0x00ff00ff00ff00ff)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "psllh %[y], %[y], %[six] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklhw %[u], %[u], %[u] \n\t"
+ "psrah %[u], %[u], %[two] \n\t"
+ "punpcklhw %[v], %[v], %[v] \n\t"
+ "psrah %[v], %[v], %[two] \n\t"
+ "pminsh %[u], %[u], %[mask1] \n\t"
+ "pminsh %[v], %[v], %[mask1] \n\t"
+
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)
+ : "memory"
+ );
}
void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
@@ -6331,96 +6343,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v, a;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
- "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a),
- [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]),
- [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]),
- [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub),
- [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb),
- [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00),
- [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
void I422ToRGB24Row_MMI(const uint8_t* src_y,
@@ -6429,105 +6447,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
-
- "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
- "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
- "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
- "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
- "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
- "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
- "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
- "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
- "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void I422ToARGB4444Row_MMI(const uint8_t* src_y,
@@ -6538,103 +6564,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "and %[g_vec], %[g_vec], %[mask1] \n\t"
- "psrlw %[g_vec], %[g_vec], %[four] \n\t"
- "psrlw %[r_vec], %[g_vec], %[four] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "and %[b_vec], %[b_vec], %[mask1] \n\t"
- "psrlw %[b_vec], %[b_vec], %[four] \n\t"
- "psrlw %[r_vec], %[b_vec], %[four] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[b_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4),
- [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void I422ToARGB1555Row_MMI(const uint8_t* src_y,
@@ -6645,118 +6678,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlw %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "or %[g_vec], %[g_vec], %[mask3] \n\t"
-
- "psrlw %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "or %[b_vec], %[b_vec], %[mask3] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8),
- [mask3] "f"(0x800000008000), [lmove5] "f"(0x5)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlw %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"
+
+ "psrlw %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void I422ToRGB565Row_MMI(const uint8_t* src_y,
@@ -6767,120 +6807,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7),
- [lmove5] "f"(0x5)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void NV12ToARGBRow_MMI(const uint8_t* src_y,
@@ -6890,83 +6937,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV21ToARGBRow_MMI(const uint8_t* src_y,
@@ -6976,83 +7031,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV12ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7062,95 +7125,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18),
- [one] "f"(0x1), [rmove1] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
}
void NV21ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7160,95 +7231,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18),
- [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void NV12ToRGB565Row_MMI(const uint8_t* src_y,
@@ -7258,115 +7337,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
}
void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
@@ -7375,83 +7462,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
- "psrlh %[temp], %[y], %[eight] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[y], %[y], %[temp] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[y], %[y], %[temp] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
@@ -7460,83 +7554,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[temp], %[y], %[temp] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[y], %[y], %[eight] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[temp], %[y], %[temp] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[y], %[y], %[eight] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void I422ToRGBARow_MMI(const uint8_t* src_y,
@@ -7547,104 +7648,112 @@ void I422ToRGBARow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
- "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
- "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
- __asm__ volatile(
- "punpcklwd %[v32], %[v32], %[v32] \n\t"
- "1: \n\t"
- "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
-
- "daddi %[width], %[width], -0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "bnez %[width], 1b \n\t"
- : [v32] "+&f"(v32)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
+ __asm__ volatile (
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [v32]"+&f"(v32)
+ : [dst_ptr]"r"(dst_argb), [width]"r"(width)
+ : "memory"
+ );
}
// 10 bit YUV to ARGB
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 197efb2a..3f64010a 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -278,7 +278,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -315,7 +316,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -401,6 +403,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"orr v22.8b, v20.8b, v20.8b \n"
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
@@ -527,7 +530,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -601,6 +605,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -622,6 +627,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -645,6 +651,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
@@ -669,6 +676,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@@ -687,6 +695,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -703,6 +712,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@@ -716,6 +726,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@@ -739,6 +750,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -763,6 +775,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -783,6 +796,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"rev64 v0.4s, v0.4s \n"
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
"st1 {v0.D}[0], [%1], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -800,6 +814,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@@ -818,6 +833,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
@@ -836,6 +852,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
@@ -853,6 +870,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
@@ -885,6 +903,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
@@ -942,6 +961,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
@@ -972,7 +992,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
@@ -989,8 +1009,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
@@ -1023,6 +1043,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -1038,6 +1059,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -1057,6 +1079,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -1077,6 +1100,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -1102,6 +1126,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
@@ -1129,6 +1154,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
@@ -1153,6 +1179,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -1175,6 +1202,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1198,6 +1226,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1217,6 +1246,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
@@ -1238,6 +1268,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
@@ -1256,6 +1287,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB1555.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
@@ -1276,6 +1308,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB4444.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
@@ -1299,6 +1332,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1316,6 +1350,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -1338,6 +1373,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1359,6 +1395,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1399,6 +1436,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@@ -1767,6 +1805,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
@@ -1832,6 +1871,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
@@ -1897,6 +1937,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
@@ -1927,6 +1968,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
@@ -1954,6 +1996,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
@@ -1980,6 +2023,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
@@ -2003,6 +2047,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@@ -2026,6 +2071,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@@ -2049,6 +2095,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@@ -2072,6 +2119,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
@@ -2095,6 +2143,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
@@ -2116,6 +2165,7 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
@@ -2135,8 +2185,10 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"umull v0.8h, v0.8b, v4.8b \n" // B
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
@@ -2174,6 +2226,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"b 99f \n"
@@ -2290,6 +2343,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2331,6 +2385,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -2369,6 +2424,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2395,6 +2451,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"orr v1.8b, v0.8b, v0.8b \n" // G
"orr v2.8b, v0.8b, v0.8b \n" // R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2435,6 +2492,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -2495,6 +2553,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2525,6 +2584,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2550,6 +2610,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2575,6 +2636,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2604,6 +2666,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2626,6 +2689,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2653,6 +2717,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2689,6 +2754,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -2727,6 +2793,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -2754,6 +2821,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"fcvtn v1.4h, v2.4s \n" // 8 half floats
"fcvtn2 v1.8h, v3.4s \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2779,6 +2847,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2803,6 +2872,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"fmul v3.4s, v3.4s, %3.s[0] \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2828,6 +2898,7 @@ float ScaleMaxSamples_NEON(const float* src,
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator
@@ -2857,6 +2928,7 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
@@ -2878,6 +2950,7 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2902,18 +2975,23 @@ void GaussCol_NEON(const uint16_t* src0,
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -2946,6 +3024,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
@@ -2982,14 +3061,19 @@ void GaussCol_F32_NEON(const float* src0,
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
"fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@@ -3024,6 +3108,7 @@ void GaussRow_F32_NEON(const float* src,
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
@@ -3052,6 +3137,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
@@ -3079,6 +3165,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3107,6 +3194,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3124,6 +3212,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// pixels
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
@@ -3140,6 +3229,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 0a7b80ce..e155a484 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -31,6 +31,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -54,6 +55,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -82,6 +84,8 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -102,6 +106,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -131,6 +136,10 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -156,7 +165,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -211,7 +221,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -252,7 +264,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -286,7 +300,8 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -400,7 +415,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -504,7 +522,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -528,7 +548,8 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -599,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -647,6 +668,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
"b 99f \n"
@@ -658,6 +681,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 25b \n"
"b 99f \n"
@@ -668,6 +693,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 50b \n"
"b 99f \n"
@@ -679,6 +706,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 75b \n"
"b 99f \n"
@@ -687,6 +716,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 100b \n"
"99: \n"
@@ -713,6 +743,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -736,6 +767,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -769,6 +801,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -794,6 +828,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -838,6 +873,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
@@ -878,6 +915,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -949,7 +987,8 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -984,6 +1023,8 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -1032,6 +1073,8 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index c75f715a..f97ad9a7 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -804,6 +804,23 @@ TEST_F(LibYUVPlanarTest, TestARGBMirror) {
}
}
+TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
+ SIMD_ALIGNED(uint8_t orig_pixels[1280]);
+ SIMD_ALIGNED(uint8_t dst_pixels[1280]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i] = i;
+ }
+ MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+ }
+}
+
TEST_F(LibYUVPlanarTest, TestShade) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
@@ -3315,8 +3332,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
- 1280);
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
#endif
}
@@ -3369,36 +3386,24 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_c[0], 1280);
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_F32_NEON(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_opt[0], 1280);
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
} else {
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_opt[0], 1280);
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@@ -3455,18 +3460,18 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
- (float*)(dst_pixels_c), benchmark_width_,
- benchmark_width_, benchmark_height_);
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
- (float*)(dst_pixels_opt), benchmark_width_,
- benchmark_width_, benchmark_height_);
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
}
- for (int i = 0; i < benchmark_width_ * benchmark_height_ ; ++i) {
- EXPECT_NEAR(((float*)(dst_pixels_c)) [i],
- ((float*)(dst_pixels_opt))[i], 1.f) << i;
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
}
free_aligned_buffer_page_end(dst_pixels_c);
diff --git a/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc
index d2003895..3208b66a 100644
--- a/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
} // namespace libyuv