Diffstat (limited to 'files/source/row_gcc.cc')
-rw-r--r--  files/source/row_gcc.cc | 578
1 file changed, 487 insertions(+), 91 deletions(-)
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index dce8c439..e94fd04d 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -27,6 +27,9 @@ static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
+ 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
+
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
@@ -39,12 +42,18 @@ static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
127, -84, -43, 0, 127, -84, -43, 0};
+static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0,
+ -43, -84, 127, 0, -43, -84, 127, 0};
+
static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
-18, -94, 112, 0, -18, -94, 112, 0};
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
+static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
+ 127, -107, -20, 0, 127, -107, -20, 0};
+
// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
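
The new kABGRToYJ / kABGRToUJ / kABGRToVJ tables are the full-range (JPEG) coefficients reordered for libyuv's ABGR byte order (R, G, B, A in memory), just as kARGBToYJ / kARGBToUJ / kARGBToVJ line up with B, G, R, A. A scalar sketch of what one output lane works out to, with hypothetical helper names and without the SIMD path's exact intermediate rounding:

    #include <stdint.h>

    // px points at one ABGR pixel stored as R,G,B,A, matching the
    // coefficient order 77,150,29 / -43,-84,127 / 127,-107,-20 above.
    static inline uint8_t AbgrToYJ(const uint8_t* px) {
      return (uint8_t)((77 * px[0] + 150 * px[1] + 29 * px[2] + 128) >> 8);
    }
    static inline uint8_t AbgrToUJ(const uint8_t* px) {
      return (uint8_t)((-43 * px[0] - 84 * px[1] + 127 * px[2] + 0x8080) >> 8);
    }
    static inline uint8_t AbgrToVJ(const uint8_t* px) {
      return (uint8_t)((127 * px[0] - 107 * px[1] - 20 * px[2] + 0x8080) >> 8);
    }
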
@@ -729,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"movd %3,%%xmm6 \n"
@@ -777,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
@@ -1201,6 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
"lea 0x40(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
"+r"(width) // %2
@@ -1228,6 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
"lea 0x40(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_ab64), // %1
"+r"(width) // %2
@@ -1256,6 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_ar64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1284,6 +1296,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_ab64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1398,6 +1411,24 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
}
#endif // HAS_ARGBTOYJROW_SSSE3
+#ifdef HAS_ABGRTOYJROW_SSSE3
+// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
+// Same as ABGRToYRow but different coefficients, no add 16.
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ABGRTOYJROW_SSSE3
+
#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
@@ -1416,7 +1447,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
}
#endif // HAS_RGBATOYJROW_SSSE3
-#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
+ defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif
@@ -1429,9 +1461,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
-
- LABELALIGN RGBTOY_AVX2(ymm7)
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1451,9 +1482,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
-
- LABELALIGN RGBTOY_AVX2(ymm7)
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1472,9 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN RGBTOY_AVX2(ymm5)
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1486,15 +1515,32 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
}
#endif // HAS_ARGBTOYJROW_AVX2
+#ifdef HAS_ABGRTOYJROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 YJ values.
+void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYJROW_AVX2
+
#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN RGBTOY_AVX2(
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
ymm5) "vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@@ -1571,11 +1617,15 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBTOUVROW_SSSE3
-#ifdef HAS_ARGBTOUVROW_AVX2
+#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
+ defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+#endif
+
+#if defined(HAS_ARGBTOUVROW_AVX2)
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1765,6 +1815,71 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
}
#endif // HAS_ARGBTOUVJROW_AVX2
+// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix
+#ifdef HAS_ABGRTOUVJROW_AVX2
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kSub128), // %5
+ "m"(kABGRToVJ), // %6
+ "m"(kABGRToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_AVX2
+
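
ABGRToUVJRow_AVX2 averages each 2x2 block of ABGR pixels across the two input rows and then applies the kABGRToUJ / kABGRToVJ weights. A rough scalar model (hypothetical helper, ignoring the SIMD path's exact averaging and rounding order):

    #include <stdint.h>

    // width is in pixels; one U and one V byte per 2x2 ABGR (R,G,B,A) block.
    static void AbgrToUVJRow(const uint8_t* row0, const uint8_t* row1,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
      for (int x = 0; x < width; x += 2) {
        const uint8_t* p0 = row0 + 4 * x;
        const uint8_t* p1 = row1 + 4 * x;
        int r = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
        int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
        int b = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
        dst_u[x >> 1] = (uint8_t)((-43 * r - 84 * g + 127 * b + 0x8080) >> 8);
        dst_v[x >> 1] = (uint8_t)((127 * r - 107 * g - 20 * b + 0x8080) >> 8);
      }
    }
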
#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
@@ -1831,6 +1946,72 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBTOUVJROW_SSSE3
+#ifdef HAS_ABGRTOUVJROW_SSSE3
+void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToVJ), // %5
+ "m"(kABGRToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_SSSE3
+
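
Together, the new SSSE3 Y and UV kernels cover a full-range ABGR to subsampled YUV conversion. A hedged driver sketch using the declarations from row.h, assuming width is a multiple of 16 and height is even (libyuv itself dispatches through its _Any wrappers and C fallbacks for other sizes):

    #include <stdint.h>

    // Hypothetical driver, for illustration only.
    void AbgrToJ420Sketch(const uint8_t* src, int src_stride,
                          uint8_t* dst_y, int y_stride,
                          uint8_t* dst_u, int u_stride,
                          uint8_t* dst_v, int v_stride,
                          int width, int height) {
      for (int y = 0; y < height; y += 2) {
        ABGRToUVJRow_SSSE3(src, src_stride, dst_u, dst_v, width);
        ABGRToYJRow_SSSE3(src, dst_y, width);
        ABGRToYJRow_SSSE3(src + src_stride, dst_y + y_stride, width);
        src += 2 * src_stride;
        dst_y += 2 * y_stride;
        dst_u += u_stride;
        dst_v += v_stride;
      }
    }
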
#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
@@ -2153,9 +2334,6 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422 10 bit, upsample to 8 UV
-// TODO(fbarchard): Consider shufb to replace pack/unpack
-// TODO(fbarchard): Consider pmulhuw to replace psraw
-// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210 \
"movq (%[u_buf]),%%xmm3 \n" \
"movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
@@ -2165,7 +2343,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"packuswb %%xmm3,%%xmm3 \n" \
"punpcklwd %%xmm3,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
"psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
#define READYUVA210 \
@@ -2177,7 +2358,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"packuswb %%xmm3,%%xmm3 \n" \
"punpcklwd %%xmm3,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
"psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n" \
"movdqu (%[a_buf]),%%xmm5 \n" \
"psraw $2,%%xmm5 \n" \
@@ -2196,7 +2380,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"punpckhwd %%xmm2,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
"psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 444 10 bit. With 8 Alpha.
@@ -2211,7 +2398,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"punpckhwd %%xmm2,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
- "psllw $0x6,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n" \
"movdqu (%[a_buf]),%%xmm5 \n" \
"psraw $2,%%xmm5 \n" \
@@ -2228,7 +2418,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"packuswb %%xmm3,%%xmm3 \n" \
"punpcklwd %%xmm3,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
- "psllw $0x4,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $4,%%xmm4 \n" \
+ "psrlw $8,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
@@ -2399,6 +2592,20 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
"lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+// Store 8 RGB24 values.
+#define STORERGB24 \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm2,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "pshufb %%xmm5,%%xmm0 \n" \
+ "pshufb %%xmm6,%%xmm1 \n" \
+ "palignr $0xc,%%xmm0,%%xmm1 \n" \
+ "movq %%xmm0,(%[dst_rgb24]) \n" \
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+
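
STORERGB24 factors the existing RGB24 packing out of I422ToRGB24Row_SSSE3 so the new I444 variant below can reuse it. In scalar terms it takes the eight blue, green and red byte lanes that YUVTORGB leaves in xmm0/xmm1/xmm2 and writes 24 contiguous bytes; a sketch with a hypothetical helper name:

    #include <stdint.h>

    // libyuv RGB24 stores each pixel as B,G,R.
    static void StoreRgb24(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                           uint8_t* dst_rgb24) {
      for (int i = 0; i < 8; ++i) {
        dst_rgb24[3 * i + 0] = b[i];
        dst_rgb24[3 * i + 1] = g[i];
        dst_rgb24[3 * i + 2] = r[i];
      }
    }
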
// Store 8 AR30 values.
#define STOREAR30 \
"psraw $0x4,%%xmm0 \n" \
@@ -2508,17 +2715,43 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
"1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ STORERGB24
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STORERGB24
"subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -3209,7 +3442,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
@@ -3224,7 +3459,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n" \
"vmovdqu (%[a_buf]),%%ymm5 \n" \
"vpsraw $2,%%ymm5,%%ymm5 \n" \
@@ -3242,7 +3479,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
"vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 212 12 bit, upsample to 16 UV
@@ -3257,7 +3496,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
+ "vpsllw $4,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $8,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 16 UV from 410. With 16 Alpha.
@@ -3271,7 +3512,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
"vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n" \
"vmovdqu (%[a_buf]),%%ymm5 \n" \
"vpsraw $2,%%ymm5,%%ymm5 \n" \
@@ -4785,6 +5028,84 @@ void DetileRow_SSE2(const uint8_t* src,
}
#endif // HAS_DETILEROW_SSE2
+#ifdef HAS_DETILEROW_16_SSE2
+void DetileRow_16_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILEROW_16_SSE2
+
+#ifdef HAS_DETILEROW_16_AVX
+void DetileRow_16_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea (%0,%3,2),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_16_AVX
+
+#ifdef HAS_DETILETOYUY2_SSE2
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // Load 16 Y
+ "sub $0x10,%3 \n"
+ "lea (%0,%4),%0 \n"
+ "movdqu (%1),%%xmm1 \n" // Load 8 UV
+ "lea (%1,%5),%1 \n"
+ "movdqu %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
+ );
+}
+#endif // HAS_DETILETOYUY2_SSE2
+
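
DetileRow_16_SSE2/AVX walk a 16-pixel-wide tiled 16-bit buffer, copying one tile-row of pixels and then hopping to the next tile, and DetileToYUY2_SSE2 interleaves 16 detiled Y bytes with 16 detiled UV bytes into YUY2. A scalar model of the 16-bit detile (hypothetical helper; the stride is in elements, matching the asm's lea (%0,%3,2),%0):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void DetileRow16(const uint16_t* src, ptrdiff_t src_tile_stride,
                            uint16_t* dst, int width) {
      for (int x = 0; x < width; x += 16) {
        memcpy(dst + x, src, 16 * sizeof(uint16_t));  // one tile-row of 16 pixels
        src += src_tile_stride;                       // jump to the next tile
      }
    }
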
#ifdef HAS_DETILESPLITUVROW_SSSE3
// TODO(greenjustin): Look into generating these constants instead of loading
// them since this can cause branch mispredicts for fPIC code on 32-bit
@@ -4821,36 +5142,59 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
}
#endif // HAS_DETILESPLITUVROW_SSSE3
+#ifdef HAS_MERGEUVROW_AVX512BW
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%0),%%zmm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+ "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+ "vmovdqu64 %%zmm2,(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX512BW
+
#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
- asm volatile(
+ asm volatile("sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
+ LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
+ "vpmovzxbw (%0),%%ymm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_AVX2
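
The rewritten MergeUVRow_AVX2 (and the new AVX512BW variant) builds each output 16-bit unit as U | (V << 8) after zero-extending the source bytes to words, replacing the old unpack-and-extract sequence; on little-endian x86 that stores the U byte followed by the V byte, i.e. NV12-ordered UV. The scalar equivalent:

    #include <stdint.h>

    // Interleave U and V planes into a UV (NV12-style) plane.
    // width is in U/V samples; 2 * width bytes are written.
    static void MergeUVRowScalar(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_u[x];
        dst_uv[2 * x + 1] = src_v[x];
      }
    }
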
@@ -4859,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
- asm volatile(
+ asm volatile("sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
+ LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%1,1),%%xmm1 \n"
@@ -4876,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
"lea 0x20(%2),%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_SSE2
@@ -4891,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
uint16_t* dst_uv,
int depth,
int width) {
- depth = 16 - depth;
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
+ "vmovd %5,%%xmm4 \n"
+
+
"sub %0,%1 \n"
+ // 8 pixels per loop.
- // 16 pixels per loop.
- LABELALIGN
+ LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
+ "vpmovzxwd (%0),%%ymm0 \n"
+ "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
"vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
- "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
+ "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- : "r"(depth) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(16 - depth), // %4
+ "r"(32 - depth) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
// clang-format on
}
#endif // HAS_MERGEUVROW_AVX2
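
With the shift amounts now passed separately as 16 - depth and 32 - depth, each output dword of MergeUVRow_16_AVX2 is (U << (16 - depth)) | (V << (32 - depth)): both samples land MSB-aligned in their own 16-bit half. A per-pair sketch, assuming 8 <= depth <= 16 as elsewhere in the 16-bit paths:

    #include <stdint.h>

    static inline uint32_t MergeUV16(uint16_t u, uint16_t v, int depth) {
      return ((uint32_t)u << (16 - depth)) | ((uint32_t)v << (32 - depth));
    }
    // Example: depth 10, u = 0x3FF, v = 0x200  ->  0x8000FFC0
    // (low word 0xFFC0 = u << 6, high word 0x8000 = v << 6).
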
@@ -5127,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
void Convert8To16Row_SSE2(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
@@ -6178,6 +6517,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%1),%1 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -6461,6 +6801,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
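
YUY2ToNVUVRow produces an NV12-style interleaved UV row straight from YUY2: chroma sits at the odd byte offsets of each Y0 U Y1 V group, so the kernel averages two source rows with pavgb (rounded average) and keeps the high byte of every 16-bit pair. A scalar sketch:

    #include <stdint.h>

    // width is in pixels; width bytes of interleaved U,V are written
    // (one U,V pair per two pixels).
    static void Yuy2ToNVUVRow(const uint8_t* src_yuy2, int stride_yuy2,
                              uint8_t* dst_uv, int width) {
      const uint8_t* row1 = src_yuy2 + stride_yuy2;
      for (int x = 0; x < width; x += 2) {
        dst_uv[x + 0] = (uint8_t)((src_yuy2[2 * x + 1] + row1[2 * x + 1] + 1) >> 1);
        dst_uv[x + 1] = (uint8_t)((src_yuy2[2 * x + 3] + row1[2 * x + 3] + 1) >> 1);
      }
    }
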
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
@@ -6661,6 +7028,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,