1 file changed, 157 insertions, 91 deletions
diff --git a/libvpx/vpx_dsp/mips/variance_mmi.c b/libvpx/vpx_dsp/mips/variance_mmi.c
index c1780c33a..c2adcfa01 100644
--- a/libvpx/vpx_dsp/mips/variance_mmi.c
+++ b/libvpx/vpx_dsp/mips/variance_mmi.c
@@ -150,7 +150,7 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[3] */                                  \
-  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \
   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
 
@@ -163,7 +163,7 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[3] */                                  \
-  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \
   "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"
 
@@ -225,8 +225,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[7] */                                  \
-  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
-  "and        %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "pand       %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
   "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
@@ -247,8 +247,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[7] */                                  \
-  "and        %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
-  "and        %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
+  "pand       %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
+  "pand       %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
   "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \
   "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"
@@ -319,8 +319,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[8] ~ temp2[15] */                                 \
-  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
-  "and        %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "pand       %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
   "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"
@@ -343,8 +343,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[8] ~ temp2[15] */                                 \
-  "and        %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
-  "and        %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
+  "pand       %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
+  "pand       %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
   "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \
   "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"
@@ -414,13 +414,14 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -478,7 +479,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
-    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (64 * high));
 }
@@ -519,13 +521,14 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     "li         %[tmp0],    0x40                                \n\t"
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -559,7 +562,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
-    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
       [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / 2048);
 }
@@ -590,14 +594,15 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -625,7 +630,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -636,7 +641,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (32 * high));
 }
@@ -676,14 +682,15 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -701,7 +708,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -712,7 +719,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (16 * high));
 }
@@ -753,14 +761,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -773,7 +782,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -784,7 +793,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (8 * high));
 }
@@ -825,14 +835,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-    "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -845,7 +856,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -856,7 +867,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (4 * high));
 }
@@ -894,12 +906,13 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
 
     "1:                                                         \n\t"
     VARIANCE_SSE_16
@@ -909,7 +922,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -947,12 +961,13 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
 
     "1:                                                         \n\t"
     VARIANCE_SSE_8
@@ -962,7 +977,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
   uint8_t *temp2_ptr = temp2;
   mips_reg l_counter = counter;
   double ftmp[15];
+  double ff_ph_40, mask;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  uint64_t x0, x1, y0, y1, all;
 
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
-
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[15]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
 
@@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR16XN(H)                                                      \
@@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[15];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
 
     // fdata3: fdata3[0] ~ fdata3[7]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR8XN(H)                                                      \
@@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[7];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp6])
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp6])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[3]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
 
@@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
       [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
-      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+      [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR4XN(H)                                                      \