Diffstat (limited to 'third_party/libyuv/source/row_win.cc')
-rw-r--r--  third_party/libyuv/source/row_win.cc  736
1 file changed, 365 insertions(+), 371 deletions(-)
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 951518926f..5203b57c69 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -10,9 +10,9 @@
#include "libyuv/row.h"
-// This module is for Visual C 32/64 bit and clangcl 32 bit
+// This module is for Visual C 32/64 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#if defined(_M_X64)
#include <emmintrin.h>
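Note on the guard change: clang-cl previously compiled the 32-bit paths in this file; with !defined(__clang__) it is excluded entirely and is expected to fall back to the GCC-style row implementations instead. A standalone probe (not part of the patch) to see which side of the guard a toolchain lands on:

    /* Probe: compile and run under the toolchain in question. */
    #include <stdio.h>
    int main(void) {
    #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
        !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
      puts("row_win.cc enabled: MSVC x86/x64, not clang-cl");
    #else
      puts("row_win.cc disabled for this toolchain");
    #endif
      return 0;
    }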
@@ -29,9 +29,9 @@ extern "C" {
// Read 8 UV from 444
#define READYUV444 \
- xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
u_buf += 8; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -39,9 +39,9 @@ extern "C" {
// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \
- xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
u_buf += 8; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -51,10 +51,10 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -62,10 +62,10 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -74,24 +74,21 @@ extern "C" {
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+#define YUVTORGB(yuvconstants) \
+ xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+ xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+ xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+ xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+ xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+ xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+ xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
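Per-pixel scalar model of the rewritten macro (a sketch, not libyuv code; it assumes 6-bit fixed-point coefficients and that kUVToG stores positive magnitudes whose negative sign is baked into the subtraction, as the adds/subs pattern above implies):

    #include <stdint.h>

    static uint8_t Clamp255(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    // One pixel of YUVTORGB: re-bias UV to signed, scale and bias Y once,
    // then add the UV dot products for B and R and subtract for G.
    static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                         int ub, int vb,              // kUVToB pair
                         int ug, int vg,              // kUVToG pair
                         int ur, int vr,              // kUVToR pair
                         uint32_t ycoeff, int ybias,  // kYToRgb, kYBiasToRgb
                         uint8_t* b, uint8_t* g, uint8_t* r) {
      int u1 = u - 128;  // _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80))
      int v1 = v - 128;
      // xmm4 holds (y | y << 8) == y * 257, so pmulhuw is (y*257*ycoeff)>>16.
      int y16 = (int)(((uint32_t)y * 257u * ycoeff) >> 16) + ybias;
      *b = Clamp255((y16 + (u1 * ub + v1 * vb)) >> 6);
      *g = Clamp255((y16 - (u1 * ug + v1 * vg)) >> 6);
      *r = Clamp255((y16 + (u1 * ur + v1 * vr)) >> 6);
    }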
// Store 8 ARGB values.
@@ -112,7 +109,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -132,7 +129,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
@@ -150,7 +147,7 @@ void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -170,7 +167,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA444
@@ -247,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for bias of UV.
+static const ulvec8 kBiasUV128 = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
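The two constants it replaces are the same bit pattern: 128u bytes and 0x8080u words both lay out in memory as a repeated 0x80 byte, so a single 32-byte table serves the old paddw users, the paddb users, and the ymmword reference in YUVTORGB_AVX2 below. A standalone check of that equivalence:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
      uint8_t bytes[16];  // kAddUV128 / kBiasUV128 layout
      uint16_t words[8] = {0x8080, 0x8080, 0x8080, 0x8080,
                           0x8080, 0x8080, 0x8080, 0x8080};  // kAddUVJ128
      memset(bytes, 0x80, sizeof(bytes));
      assert(memcmp(bytes, words, sizeof(bytes)) == 0);
      return 0;
    }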
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -1427,7 +1424,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1440,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1499,7 +1496,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1512,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@@ -1573,7 +1570,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1586,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1641,7 +1638,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1654,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@@ -1709,7 +1706,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
@@ -1719,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1767,7 +1764,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1780,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@@ -1839,7 +1836,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1852,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@@ -1911,7 +1908,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1924,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@@ -1986,14 +1983,14 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 16 UV from 444
#define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
@@ -2001,12 +1998,12 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 16 UV from 444. With 16 Alpha.
#define READYUVA444_AVX2 \
__asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm3, [esi] /* U */ \
__asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
@@ -2017,123 +2014,122 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
__asm vpermq ymm5, ymm5, 0xd8 \
__asm lea ebp, [ebp + 16]}
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+ __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 \
+ __asm vpaddsw ymm0, ymm0, ymm4 \
+ __asm vpsubsw ymm1, ymm4, ymm1 \
+ __asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}
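This is the 16-pixel inline-asm analogue of the intrinsics YUVTORGB above; structurally the only difference is that the UV bias comes from the kBiasUV128 table instead of a set1 immediate. A hypothetical intrinsics rendering of its first two steps (illustrative names, not libyuv API):

    #include <immintrin.h>

    // vpsubb ymm3, ymm3, kBiasUV128: re-bias packed UV bytes to signed.
    static inline __m256i BiasUV(__m256i uv) {
      return _mm256_sub_epi8(uv, _mm256_set1_epi8((char)0x80));
    }

    // vpmulhuw + vpaddw: scale Y, then add the per-colorspace Y bias.
    static inline __m256i YTerm(__m256i y, __m256i y_to_rgb, __m256i y_bias) {
      return _mm256_add_epi16(_mm256_mulhi_epu16(y, y_to_rgb), y_bias);
    }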
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64]}
// Store 16 RGBA values.
#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64]}
@@ -2480,11 +2476,11 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 8 UV from 444.
#define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
@@ -2492,10 +2488,10 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \
__asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
@@ -2504,180 +2500,178 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
__asm lea ebp, [ebp + 8]}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
- __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
+ __asm { \
+ __asm psubb xmm3, xmmword ptr kBiasUV128 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm pmaddubsw xmm0, xmm3 \
+ __asm pmaddubsw xmm1, xmm3 \
+ __asm pmaddubsw xmm2, xmm3 \
+ __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm paddw xmm4, xmm3 \
+ __asm paddsw xmm0, xmm4 \
+ __asm paddsw xmm2, xmm4 \
+ __asm psubsw xmm4, xmm1 \
+ __asm movdqa xmm1, xmm4 \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
}
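One subtlety in this two-operand form: psubsw can only compute dst = dst - src in place, and G needs y - g_uv. The macro therefore lets B and R consume xmm4 (the Y term) first, subtracts into xmm4, and copies the result back to xmm1; the AVX2 macro above avoids the copy with three-operand vpsubsw. Intrinsics sketch of the G step:

    #include <emmintrin.h>

    // psubsw xmm4, xmm1 ; movdqa xmm1, xmm4  ==  saturate16(y16 - g_uv)
    static inline __m128i GChannel(__m128i y16, __m128i g_uv) {
      return _mm_subs_epi16(y16, g_uv);
    }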
// Store 8 ARGB values.
#define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm0 \
__asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32]}
// Store 8 BGRA values.
#define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGBA values.
#define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGB24 values.
#define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24]}
// Store 8 RGB565 values.
#define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16]}
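Scalar equivalent of the 565 packing above, assuming the masks in xmm5/xmm6/xmm7 are 0x001f/0x07e0/0xf800 as the shift amounts imply:

    #include <stdint.h>

    // One ARGB dword (B bits 0-7, G 8-15, R 16-23) -> one RGB565 word.
    static uint16_t PackRGB565(uint32_t argb) {
      return (uint16_t)(((argb >> 3) & 0x001f) |  /* psrld 3, pand xmm5 */
                        ((argb >> 5) & 0x07e0) |  /* psrld 5, pand xmm6 */
                        ((argb >> 8) & 0xf800));  /* pslld 8, psrad 16, pand xmm7 */
    }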
// 8 pixels.
@@ -4347,13 +4341,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4442,7 +4436,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
@@ -4487,7 +4481,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4581,7 +4575,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4752,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm0, xmm0 // 8 B values
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
+ packuswb xmm5, xmm5 // 8 R values
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
@@ -4817,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
+ phaddsw xmm1, xmm7 // R
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
punpcklbw xmm1, xmm6 // 8 RA values
@@ -4878,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
convertloop:
movdqu xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqu xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
+ pmullw xmm0, xmm3 // * interval_size
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
@@ -4907,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
@@ -4918,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
@@ -4937,29 +4931,29 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- pxor xmm5, xmm5 // constant 0
+ pxor xmm5, xmm5 // constant 0
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
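The loop relies on a duplication trick: punpcklbw xmm0, xmm0 turns each byte a into the word a*257, so pmulhuw against the zero-extended second operand yields (a*257*b) >> 16, a close approximation of a*b/255 (it can be low by one; 255*255 gives 254). Scalar form:

    #include <stdint.h>

    // Approximate (a * b) / 255 the way the SIMD loop does.
    static uint8_t MulDiv255Approx(uint8_t a, uint8_t b) {
      return (uint8_t)(((uint32_t)a * 257u * b) >> 16);
    }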
@@ -4977,14 +4971,14 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4992,11 +4986,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop49
convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5007,11 +5001,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop19
convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
+ movd xmm0, [eax] // read 1 pixel from src_argb
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixel from src_argb1
lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -5026,23 +5020,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5056,20 +5050,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
+ vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -5077,8 +5071,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5094,19 +5088,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -5124,21 +5118,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5165,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
mov edi, [esp + 8 + 12] // src_y2
mov edx, [esp + 8 + 16] // dst_sobelx
mov ecx, [esp + 8 + 20] // width
@@ -5176,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
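The taps read above are the standard 3x3 Sobel X pattern; the rest of the loop (outside this hunk) sums the three row differences with the middle row weighted twice, then clamps the absolute value. Scalar sketch for one output pixel (a model, not the libyuv C fallback verbatim):

    #include <stdlib.h>

    static unsigned char SobelX1(const unsigned char* y0,
                                 const unsigned char* y1,
                                 const unsigned char* y2) {
      int g = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
      int a = abs(g);
      return (unsigned char)(a > 255 ? 255 : a);
    }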
@@ -5221,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
mov edx, [esp + 4 + 12] // dst_sobely
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5230,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5275,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5284,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
pslld xmm5, 24 // 0xff000000
convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
@@ -5323,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5357,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
@@ -5535,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5577,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
test edx, 15
jne l4b
- // 4 pixel loop
+ // 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5623,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
@@ -5657,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
- movq xmm2, qword ptr [ecx] // uv
+ movq xmm2, qword ptr [ecx] // uv
movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 28] // width
shl esi, 16 // 4, stride
@@ -5666,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
sub ecx, 4
jl l4b
- // setup for 4 pixel loop
+ // setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5678,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
- // 4 pixel loop
+ // 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
packssdw xmm0, xmm1 // x, y as 8 shorts
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
+ pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
+ pshufd xmm0, xmm0, 0x39 // shift right
movd xmm1, [eax + esi] // read pixel 0
movd xmm6, [eax + edi] // read pixel 1
punpckldq xmm1, xmm6 // combine pixel 0 and 1
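The offset math packs (x, y) pairs as 16-bit lanes and multiplies with pmaddwd against xmm5 = {4, stride} repeated, so each dword lane becomes a byte offset into the 4-byte-per-pixel source:

    #include <stdint.h>

    // One pmaddwd lane: byte offset of pixel (x, y), per the comment above.
    static int32_t ArgbOffset(int16_t x, int16_t y, int16_t stride) {
      return (int32_t)x * 4 + (int32_t)y * stride;
    }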
@@ -5739,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
__asm {
push esi
push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5749,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
cmp eax, 128
- je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+ je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
vmovd xmm0, eax // high fraction 0..255
neg eax
@@ -5776,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 8
vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
@@ -5817,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
push esi
push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
- // Dispatch to specialized filters if applicable.
+ // Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
movd xmm0, eax // high fraction 0..255
neg eax
@@ -5846,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
+ psubb xmm0, xmm4 // bias image by -128
psubb xmm1, xmm4
movdqa xmm2, xmm5
movdqa xmm3, xmm5
@@ -5895,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // width
@@ -5922,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // width
@@ -5960,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
- movq xmm2, qword ptr [esi] // U
+ movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
+ punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
@@ -5997,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
- movq xmm2, qword ptr [esi] // U
+ movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
+ punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
+ punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqu [edi], xmm1
movdqu [edi + 16], xmm2
@@ -6039,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
- // 2 pixel loop.
+ // 2 pixel loop.
convertloop:
- // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
- // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+ // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3
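For context: the function evaluates a cubic per channel from the four coefficient vectors C0..C3 loaded from poly (see the AVX2 variant below). A numerically equivalent Horner form, with x the channel value converted to float as in this loop:

    // Sketch: C0 + C1*x + C2*x^2 + C3*x^3 for one channel.
    static float PolyChannel(float x, float c0, float c1, float c2, float c3) {
      return c0 + x * (c1 + x * (c2 + x * c3));
    }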
@@ -6091,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
const float* poly,
int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* poly */
vbroadcastf128 ymm4, [ecx] // C0
vbroadcastf128 ymm5, [ecx + 16] // C1
@@ -6131,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
float scale,
int width) {
__asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
mulss xmm4, kExpBias
@@ -6140,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
pxor xmm5, xmm5
sub edx, eax
- // 8 pixel loop.
+ // 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6178,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6188,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vcvtdq2ps ymm2, ymm2
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
vmulps ymm2, ymm2, ymm4
- vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
+ vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [eax + edx - 32], ymm2
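The non-F16C paths use a bit trick: kExpBias is 2^-112, which rebases the float exponent by the difference of the float and half-float exponent biases (127 - 15), after which the half-precision encoding is just the float's bit pattern shifted right 13, truncating. Scalar sketch (valid for non-negative inputs that land in half range):

    #include <stdint.h>
    #include <string.h>

    static uint16_t FloatToHalfTruncate(float value, float scale) {
      const float kExpBias = 1.9259299444e-34f;  // 2^-112
      float f = value * scale * kExpBias;
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));
      return (uint16_t)(bits >> 13);  // psrld 13 / vpsrld 13 above
    }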
@@ -6206,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
__asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
+ mov ecx, [esp + 16] /* width */
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
add eax, 32
- vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
+ vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
- vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
+ vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [eax + edx - 32], xmm2
vmovdqu [eax + edx - 32 + 16], xmm3
@@ -6240,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6274,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6309,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
mov ecx, [esp + 8 + 12] /* width */
movd xmm2, dword ptr [esp + 8 + 16] // luma table
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
@@ -6320,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
- // 4 pixel loop.
+ // 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3