about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.chromium                       2
-rw-r--r-- include/libyuv/planar_functions.h     9
-rw-r--r-- include/libyuv/rotate.h               8
-rw-r--r-- include/libyuv/version.h              2
-rw-r--r-- source/compare_neon64.cc              4
-rw-r--r-- source/planar_functions.cc          126
-rw-r--r-- source/rotate_argb.cc                77
-rw-r--r-- source/rotate_neon64.cc              19
-rw-r--r-- source/row_gcc.cc                   123
-rw-r--r-- source/row_mmi.cc                  2971
-rw-r--r-- source/row_neon64.cc                102
-rw-r--r-- source/scale_neon64.cc               61
-rw-r--r-- unit_test/planar_test.cc             69
-rw-r--r-- unit_test/rotate_argb_test.cc        42
14 files changed, 1969 insertions, 1646 deletions
diff --git a/README.chromium b/README.chromium
index 9c78a007..4a6830aa 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1744
+Version: 1746
License: BSD
License File: LICENSE
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 57395262..2aa95335 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -313,6 +313,15 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
+// Mirror a plane of data.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h
index c64e0216..30888224 100644
--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2b52c724..9d487f0c 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1744
+#define LIBYUV_VERSION 1746
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 6e8f672a..a22ba75b 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -33,8 +33,10 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
@@ -65,8 +67,10 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 7e7e6e35..b6aac913 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -716,70 +716,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
- }
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -1047,6 +983,68 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
+// Mirror a plane of data.
+// See Also I400Mirror
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
+ int dst_stride_y, int width, int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index a93fd55f..12a240f3 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index f469baac..99f7ee16 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -37,7 +37,7 @@ void TransposeWx8_NEON(const uint8_t* src,
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@@ -48,23 +48,39 @@ void TransposeWx8_NEON(const uint8_t* src,
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
+ "mov %0, %1 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
"trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
"trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
"trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
"trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
"trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
"trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
@@ -226,6 +242,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index fa7b8cb3..c041ba11 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -1101,8 +1101,11 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
- "paddw %%" #round ",%%xmm6 \n" \
- "paddw %%" #round ",%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round \
+ ",%%xmm6 \n" \
+ "paddw %%" #round \
+ ",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@@ -1111,33 +1114,36 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"sub $0x10,%2 \n" \
"jg 1b \n"
-#define RGBTOY_AVX2(round) \
- "1: \n" \
- "vmovdqu (%0),%%ymm0 \n" \
- "vmovdqu 0x20(%0),%%ymm1 \n" \
- "vmovdqu 0x40(%0),%%ymm2 \n" \
- "vmovdqu 0x60(%0),%%ymm3 \n" \
- "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
- "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
- "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
- "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
- "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
- "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
- "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
- "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
- "lea 0x80(%0),%0 \n" \
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
- "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
- "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
- "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
- "vmovdqu %%ymm0,(%1) \n" \
- "lea 0x20(%1),%1 \n" \
- "sub $0x20,%2 \n" \
- "jg 1b \n" \
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round \
+ ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round \
+ ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
"vzeroupper \n"
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -1148,15 +1154,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
@@ -1168,8 +1174,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- LABELALIGN
- RGBTOY(xmm5)
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1187,8 +1192,7 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- LABELALIGN
- RGBTOY(xmm5)
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1210,8 +1214,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm7)
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1219,7 +1222,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
@@ -1232,8 +1236,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm7)
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1241,7 +1244,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ABGRTOYROW_AVX2
@@ -1253,15 +1257,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm5)
+ LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
@@ -1273,9 +1277,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- RGBTOY_AVX2(ymm5)
- "vzeroupper \n"
+ LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1536,7 +1539,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kSub128), // %5
+ "m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@@ -1606,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
- "m"(kSub128) // %7
+ "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
@@ -1675,15 +1678,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@@ -1755,15 +1758,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1772,15 +1775,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
- LABELALIGN
- RGBTOY(xmm7)
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
diff --git a/source/row_mmi.cc b/source/row_mmi.cc
index 50cfca72..d7d34e47 100644
--- a/source/row_mmi.cc
+++ b/source/row_mmi.cc
@@ -6040,90 +6040,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- "punpcklbh %[u], %[u], %[zero] \n\t" // u
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[zero] \n\t" // v
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ "punpcklbh %[u], %[u], %[zero] \n\t"//u
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[zero] \n\t"//v
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
// Also used for 420
@@ -6133,96 +6136,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub
- "or %[ub], %[ub], %[mask] \n\t" // must
- // sign
- // extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t" // sign
- // extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t" // v
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
+ "or %[ub], %[ub], %[mask] \n\t"//must sign extension
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"//sign extension
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"//v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
// 10 bit YUV to ARGB
@@ -6232,96 +6238,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "psllh %[y], %[y], %[six] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklhw %[u], %[u], %[u] \n\t"
- "psrah %[u], %[u], %[two] \n\t"
- "punpcklhw %[v], %[v], %[v] \n\t"
- "psrah %[v], %[v], %[two] \n\t"
- "pminsh %[u], %[u], %[mask1] \n\t"
- "pminsh %[v], %[v], %[mask1] \n\t"
-
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
-
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
-
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
- [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02),
- [mask1] "f"(0x00ff00ff00ff00ff)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "psllh %[y], %[y], %[six] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklhw %[u], %[u], %[u] \n\t"
+ "psrah %[u], %[u], %[two] \n\t"
+ "punpcklhw %[v], %[v], %[v] \n\t"
+ "psrah %[v], %[v], %[two] \n\t"
+ "pminsh %[u], %[u], %[mask1] \n\t"
+ "pminsh %[v], %[v], %[mask1] \n\t"
+
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)
+ : "memory"
+ );
}
void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
@@ -6331,96 +6343,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v, a;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
- "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a),
- [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]),
- [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]),
- [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub),
- [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb),
- [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00),
- [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),
+ [mask]"f"(mask)
+ : "memory"
+ );
}
void I422ToRGB24Row_MMI(const uint8_t* src_y,
@@ -6429,105 +6447,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
- uint64_t y, u, v;
- uint64_t b_vec[2], g_vec[2], r_vec[2];
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];
uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
-
- "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
- "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
- "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
- "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
- "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
- "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
- "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
- "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
- "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
- [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
- [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
- [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
- [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
- [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
+
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void I422ToARGB4444Row_MMI(const uint8_t* src_y,
@@ -6538,103 +6564,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t" // y1
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t" // u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "and %[g_vec], %[g_vec], %[mask1] \n\t"
- "psrlw %[g_vec], %[g_vec], %[four] \n\t"
- "psrlw %[r_vec], %[g_vec], %[four] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "and %[b_vec], %[b_vec], %[mask1] \n\t"
- "psrlw %[b_vec], %[b_vec], %[four] \n\t"
- "psrlw %[r_vec], %[b_vec], %[four] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[b_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4),
- [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"//y1
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"//u
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void I422ToARGB1555Row_MMI(const uint8_t* src_y,
@@ -6645,118 +6678,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlw %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "or %[g_vec], %[g_vec], %[mask3] \n\t"
-
- "psrlw %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "or %[b_vec], %[b_vec], %[mask3] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8),
- [mask3] "f"(0x800000008000), [lmove5] "f"(0x5)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlw %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"
+
+ "psrlw %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void I422ToRGB565Row_MMI(const uint8_t* src_y,
@@ -6767,120 +6807,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- // u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- // v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7),
- [lmove5] "f"(0x5)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
}
void NV12ToARGBRow_MMI(const uint8_t* src_y,
@@ -6890,83 +6937,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV21ToARGBRow_MMI(const uint8_t* src_y,
@@ -6976,83 +7031,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void NV12ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7062,95 +7125,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18),
- [one] "f"(0x1), [rmove1] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
}
void NV21ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7160,95 +7231,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18),
- [rmove1] "f"(0x8), [one] "f"(0x1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[ushu] \n\t"
+ "pshufh %[u], %[u], %[vshu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
+ [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
}
void NV12ToRGB565Row_MMI(const uint8_t* src_y,
@@ -7258,115 +7337,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t" // 5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3),
- [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "pshufh %[v], %[u], %[vshu] \n\t"
+ "pshufh %[u], %[u], %[ushu] \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"
+ "and %[g_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"//5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
+ [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
}
void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
@@ -7375,83 +7462,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
- "psrlh %[temp], %[y], %[eight] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[y], %[y], %[temp] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[y], %[y], %[temp] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
@@ -7460,83 +7554,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[temp], %[y], %[temp] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[y], %[y], %[eight] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf),
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
- [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
- [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"
+ "and %[temp], %[y], %[temp] \n\t"
+ "pshufh %[u], %[temp], %[ushu] \n\t"
+ "pshufh %[v], %[temp], %[vshu] \n\t"
+
+ "psrlh %[y], %[y], %[eight] \n\t"
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
}
void I422ToRGBARow_MMI(const uint8_t* src_y,
@@ -7547,104 +7648,112 @@ void I422ToRGBARow_MMI(const uint8_t* src_y,
int width) {
uint64_t y, u, v;
uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
__asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
- "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
- "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
- [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
- [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
- [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
- : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
- [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
- [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
- [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1)
- : "memory");
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"
+ "pshufh %[ug], %[ug], %[zero] \n\t"
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"
+ "pmulhuh %[y], %[y], %[yg] \n\t"
+
+ "punpcklbh %[u], %[u], %[u] \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"
+ "paddsh %[b_vec], %[y], %[bb] \n\t"
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[v] \n\t"
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
}
void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
- __asm__ volatile(
- "punpcklwd %[v32], %[v32], %[v32] \n\t"
- "1: \n\t"
- "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
-
- "daddi %[width], %[width], -0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "bnez %[width], 1b \n\t"
- : [v32] "+&f"(v32)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
+ __asm__ volatile (
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [v32]"+&f"(v32)
+ : [dst_ptr]"r"(dst_argb), [width]"r"(width)
+ : "memory"
+ );
}
// 10 bit YUV to ARGB
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 197efb2a..3f64010a 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -278,7 +278,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -315,7 +316,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -401,6 +403,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"orr v22.8b, v20.8b, v20.8b \n"
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
@@ -527,7 +530,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
// RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -601,6 +605,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -622,6 +627,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -645,6 +651,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
@@ -669,6 +676,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@@ -687,6 +695,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -703,6 +712,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@@ -716,6 +726,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@@ -739,6 +750,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -763,6 +775,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -783,6 +796,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"rev64 v0.4s, v0.4s \n"
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
"st1 {v0.D}[0], [%1], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -800,6 +814,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@@ -818,6 +833,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
@@ -836,6 +852,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
@@ -853,6 +870,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
@@ -885,6 +903,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
@@ -942,6 +961,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
@@ -972,7 +992,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
@@ -989,8 +1009,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
@@ -1023,6 +1043,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -1038,6 +1059,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -1057,6 +1079,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -1077,6 +1100,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -1102,6 +1126,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
@@ -1129,6 +1154,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
@@ -1153,6 +1179,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -1175,6 +1202,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1198,6 +1226,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1217,6 +1246,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
@@ -1238,6 +1268,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
@@ -1256,6 +1287,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB1555.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
@@ -1276,6 +1308,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB4444.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
@@ -1299,6 +1332,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1316,6 +1350,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -1338,6 +1373,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1359,6 +1395,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -1399,6 +1436,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@@ -1767,6 +1805,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
@@ -1832,6 +1871,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
@@ -1897,6 +1937,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
@@ -1927,6 +1968,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
@@ -1954,6 +1996,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
@@ -1980,6 +2023,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
@@ -2003,6 +2047,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@@ -2026,6 +2071,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@@ -2049,6 +2095,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@@ -2072,6 +2119,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
@@ -2095,6 +2143,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
@@ -2116,6 +2165,7 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
@@ -2135,8 +2185,10 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"umull v0.8h, v0.8b, v4.8b \n" // B
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
@@ -2174,6 +2226,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"b 99f \n"
@@ -2290,6 +2343,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2331,6 +2385,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -2369,6 +2424,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2395,6 +2451,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"orr v1.8b, v0.8b, v0.8b \n" // G
"orr v2.8b, v0.8b, v0.8b \n" // R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2435,6 +2492,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -2495,6 +2553,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -2525,6 +2584,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2550,6 +2610,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2575,6 +2636,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2604,6 +2666,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2626,6 +2689,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2653,6 +2717,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -2689,6 +2754,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -2727,6 +2793,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -2754,6 +2821,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"fcvtn v1.4h, v2.4s \n" // 8 half floats
"fcvtn2 v1.8h, v3.4s \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2779,6 +2847,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2803,6 +2872,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"fmul v3.4s, v3.4s, %3.s[0] \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2828,6 +2898,7 @@ float ScaleMaxSamples_NEON(const float* src,
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator
@@ -2857,6 +2928,7 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
@@ -2878,6 +2950,7 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2902,18 +2975,23 @@ void GaussCol_NEON(const uint16_t* src0,
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -2946,6 +3024,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
@@ -2982,14 +3061,19 @@ void GaussCol_F32_NEON(const float* src0,
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
"fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@@ -3024,6 +3108,7 @@ void GaussRow_F32_NEON(const float* src,
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
@@ -3052,6 +3137,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
@@ -3079,6 +3165,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3107,6 +3194,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3124,6 +3212,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// pixels
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
@@ -3140,6 +3229,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 0a7b80ce..e155a484 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -31,6 +31,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -54,6 +55,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -82,6 +84,8 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -102,6 +106,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -131,6 +136,10 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -156,7 +165,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -211,7 +221,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -252,7 +264,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -286,7 +300,8 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -400,7 +415,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -504,7 +522,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -528,7 +548,8 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -599,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -647,6 +668,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
"b 99f \n"
@@ -658,6 +681,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 25b \n"
"b 99f \n"
@@ -668,6 +693,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 50b \n"
"b 99f \n"
@@ -679,6 +706,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%2, 448] \n"
"b.gt 75b \n"
"b 99f \n"
@@ -687,6 +716,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 100b \n"
"99: \n"
@@ -713,6 +743,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -736,6 +767,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -769,6 +801,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -794,6 +828,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -838,6 +873,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
@@ -878,6 +915,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -949,7 +987,8 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -984,6 +1023,8 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@@ -1032,6 +1073,8 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index c75f715a..f97ad9a7 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -804,6 +804,23 @@ TEST_F(LibYUVPlanarTest, TestARGBMirror) {
}
}
+TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
+ SIMD_ALIGNED(uint8_t orig_pixels[1280]);
+ SIMD_ALIGNED(uint8_t dst_pixels[1280]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i] = i;
+ }
+ MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+ }
+}
+
TEST_F(LibYUVPlanarTest, TestShade) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
@@ -3315,8 +3332,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
- 1280);
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
#endif
}
@@ -3369,36 +3386,24 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_c[0], 1280);
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_F32_NEON(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_opt[0], 1280);
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
} else {
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
- &dst_pixels_opt[0], 1280);
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_F32_C(&orig_pixels[0],
- &orig_pixels[1280],
- &orig_pixels[1280 * 2],
- &orig_pixels[1280 * 3],
- &orig_pixels[1280 * 4],
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@@ -3455,18 +3460,18 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
- (float*)(dst_pixels_c), benchmark_width_,
- benchmark_width_, benchmark_height_);
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
- (float*)(dst_pixels_opt), benchmark_width_,
- benchmark_width_, benchmark_height_);
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
}
- for (int i = 0; i < benchmark_width_ * benchmark_height_ ; ++i) {
- EXPECT_NEAR(((float*)(dst_pixels_c)) [i],
- ((float*)(dst_pixels_opt))[i], 1.f) << i;
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
}
free_aligned_buffer_page_end(dst_pixels_c);
diff --git a/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc
index d2003895..3208b66a 100644
--- a/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
} // namespace libyuv