diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 04:46:39 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 04:46:39 +0000 |
commit | 61206b6b77e25faa5a07ffc7822a437487c3b996 (patch) | |
tree | 25cbeae94c26cad28a13bb089a662291b4b5d905 /files/source/row_mmi.cc | |
parent | 2618a02a89c99046741140d41b2cfda18c6dc9dd (diff) | |
parent | 3c4c137522d3a6759f2d96aee67149410ca2c1b9 (diff) | |
download | libyuv-61206b6b77e25faa5a07ffc7822a437487c3b996.tar.gz |
Snap for 10453563 from 3c4c137522d3a6759f2d96aee67149410ca2c1b9 to mainline-conscrypt-release
Change-Id: Ie22a992a771a7de60db49aa6825555946cace106
Diffstat (limited to 'files/source/row_mmi.cc')
-rw-r--r-- | files/source/row_mmi.cc | 2450 |
1 files changed, 2125 insertions, 325 deletions
diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc index d8726d09..362fd1cf 100644 --- a/files/source/row_mmi.cc +++ b/files/source/row_mmi.cc @@ -21,6 +21,8 @@ extern "C" { // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +// clang-format off + void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -603,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -611,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -624,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -637,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -650,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -669,35 +671,38 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ARGBToUVRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -707,15 +712,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -725,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -741,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -752,15 +759,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -770,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -786,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -797,15 +806,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -815,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -831,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -842,15 +853,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -860,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -888,7 +901,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -898,16 +911,17 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -915,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -928,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -941,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -954,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -973,35 +987,38 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1011,15 +1028,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1029,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1045,8 +1064,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1056,15 +1075,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1074,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1090,8 +1111,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1101,15 +1122,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1119,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1135,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1146,15 +1169,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1164,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1192,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1202,16 +1227,17 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1219,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1232,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1245,8 +1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1258,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1277,35 +1303,38 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002F00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1315,15 +1344,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1333,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1349,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1360,15 +1391,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1378,7 +1410,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1394,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1405,15 +1438,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1423,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1439,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1450,15 +1485,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1468,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1496,7 +1533,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1506,16 +1543,17 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1523,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1536,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1549,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1562,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1581,35 +1619,38 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1619,15 +1660,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1637,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1653,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1664,15 +1707,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1682,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1698,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1709,15 +1754,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1727,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1743,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1754,15 +1801,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1772,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1800,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1810,16 +1859,17 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1827,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1841,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1855,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1869,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1889,35 +1939,38 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1929,15 +1982,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1949,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -1965,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1978,15 +2033,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1998,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2014,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2027,15 +2084,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2047,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2063,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2076,15 +2135,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2096,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2124,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2134,16 +2195,17 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -2151,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2165,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2179,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2193,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2213,35 +2275,38 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2253,15 +2318,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2273,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2289,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2302,15 +2369,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2322,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2338,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2351,15 +2420,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2371,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2387,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2400,15 +2471,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2420,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2448,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2458,23 +2531,24 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; - const uint64_t shift = 0x07; - const uint64_t value = 0x0040; + const uint64_t shift = 0x08; + const uint64_t value = 0x80; const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00010026004B000FULL; + const uint64_t mask1 = 0x0001004D0096001DULL; __asm__ volatile( "1: \n\t" @@ -2544,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), [width] "r"(width) : "memory"); } -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2558,22 +2632,22 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, uint64_t src_rgb1; uint64_t ftmp[12]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x002b0054007f0002; - const uint64_t mask_v = 0x0002007f006b0014; + const uint64_t mask_u = 0x0015002a003f0002; + const uint64_t mask_v = 0x0002003f0035000a; __asm__ volatile( "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" @@ -2581,16 +2655,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2607,16 +2681,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" @@ -2624,16 +2698,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2650,16 +2724,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" @@ -2667,16 +2741,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2693,16 +2767,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" @@ -2710,16 +2784,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2748,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2759,10 +2833,10 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10) : "memory"); } @@ -4052,10 +4126,10 @@ void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uint64_t tmp0, tmp1; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x00400026004B000FULL; + const uint64_t mask2 = 0x0080004D0096001DULL; const uint64_t mask3 = 0xFF000000FF000000ULL; const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x07; + const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" @@ -4312,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, +void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4348,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } -void ARGBAddRow_MMI(const uint8_t* src_argb0, +void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4375,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, +void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4402,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } @@ -4778,7 +4852,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "memory"); } -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x55; @@ -4912,10 +4988,10 @@ void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { : "memory"); } -void MirrorUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { uint64_t src0, src1, dest0, dest1; const uint64_t mask0 = 0x00ff00ff00ff00ffULL; const uint64_t mask1 = 0x1b; @@ -5476,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { : "memory"); } -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_MMI(const uint8_t* src_argb0, +void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5532,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) @@ -6034,6 +6110,1730 @@ void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { : "memory"); } +void I444ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + __asm__ volatile ( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + "punpcklbh %[u], %[u], %[zero] \n\t"//u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + "punpcklbh %[v], %[v], %[zero] \n\t"//v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +// Also used for 420 +void I422ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t"//v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_MMI(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" + + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask), [two]"f"(0x02), + [mask1]"f"(0x00ff00ff00ff00ff) + : "memory" + ); +} + +void I422AlphaToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v,a; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), [a]"=&f"(a), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [a_ptr]"r"(src_a), [zero]"f"(0x00), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +void I422ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(mask), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); +} + +void I422ToARGB4444Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" + + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), + [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), + [alpha]"f"(-1) + : "memory" + ); +} + +void I422ToARGB1555Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" + + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [mask3]"f"(0x800000008000), + [lmove5]"f"(0x5) + : "memory" + ); +} + +void I422ToRGB565Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7), + [lmove5]"f"(0x5) + : "memory" + ); +} + +void NV12ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); +} + +void NV21ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); +} + +void NV12ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [lmove1]"f"(0x18), + [one]"f"(0x1), [rmove1]"f"(0x8) + : "memory" + ); +} + +void NV21ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); +} + +void NV12ToRGB565Row_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7) + : "memory" + ); +} + +void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); +} + +void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); +} + +void I422ToRGBARow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [alpha]"f"(-1) + : "memory" + ); +} + +void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { + __asm__ volatile ( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32]"+&f"(v32) + : [dst_ptr]"r"(dst_argb), [width]"r"(width) + : "memory" + ); +} +// clang-format on + +// 10 bit YUV to ARGB #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #ifdef __cplusplus |