author     fbarchard@google.com <fbarchard@google.com>  2014-10-01 01:16:04 +0000
committer  fbarchard@google.com <fbarchard@google.com>  2014-10-01 01:16:04 +0000
commit     044f914c297bb4fd34125879e1ea820dc62ac7db (patch)
tree       0832f9abe31475e5871d0706fcb0e2bb411f99af
parent     9c4c82181bd5e5c9e9f973dc7cf591802118be7f (diff)
download   libyuv-044f914c297bb4fd34125879e1ea820dc62ac7db.tar.gz
Change scale to unaligned movdqu.
BUG=365
TESTED=scale unittests
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22879004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1101 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r--  README.chromium               2
-rw-r--r--  include/libyuv/row.h          6
-rw-r--r--  include/libyuv/version.h      2
-rw-r--r--  source/planar_functions.cc   14
-rw-r--r--  source/row_any.cc             4
-rw-r--r--  source/row_win.cc           226
-rw-r--r--  source/scale.cc              67
-rw-r--r--  source/scale_argb.cc         35
-rw-r--r--  source/scale_common.cc       24
-rw-r--r--  source/scale_posix.cc       190
-rw-r--r--  source/scale_win.cc         219
11 files changed, 129 insertions, 660 deletions
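
[Note] The change is mechanical: every SSE2/SSSE3 scale kernel that required 16-byte alignment (movdqa loads/stores) is deleted, and the _Unaligned_ variants that already used movdqu are renamed to take their place. A minimal intrinsics sketch of the distinction (illustrative only, not libyuv code):

  #include <emmintrin.h>  // SSE2
  #include <stdint.h>

  // movdqa: 'src' must be 16-byte aligned or the load faults.
  static __m128i LoadAligned(const uint8_t* src) {
    return _mm_load_si128((const __m128i*)(src));
  }

  // movdqu: any address works; on Nehalem and later the cost is
  // essentially the same as movdqa when the address happens to be
  // aligned anyway.
  static __m128i LoadUnaligned(const uint8_t* src) {
    return _mm_loadu_si128((const __m128i*)(src));
  }

Since unaligned loads no longer carry a penalty on aligned data, the aligned kernels and the runtime alignment checks that guarded them can be dropped wholesale, as the hunks below show.
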
diff --git a/README.chromium b/README.chromium
index a6134d7..0b76d05 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1099
+Version: 1101
License: BSD
License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index e6e2c98..a2c492f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -1748,12 +1748,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ce33c17..a3f60e0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1099
+#define LIBYUV_VERSION 1101
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 3cfa6ce..d5111dc 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1798,12 +1798,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -1811,12 +1806,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
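
[Note] The two hunks above show the dispatch pattern this commit collapses throughout: callers previously climbed a three-level ladder (Any for odd widths, Unaligned for exact multiples, aligned only when every pointer and stride was 16-byte aligned). A condensed excerpt of the before/after, with names as in the diff and surrounding code elided:

  // Before: three candidates, selected at runtime.
  InterpolateRow = InterpolateRow_Any_SSE2;          // any width
  if (IS_ALIGNED(width, 4)) {
    InterpolateRow = InterpolateRow_Unaligned_SSE2;  // movdqu kernel
    if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
      InterpolateRow = InterpolateRow_SSE2;          // movdqa kernel
    }
  }

  // After: two candidates; InterpolateRow_SSE2 is the old movdqu kernel.
  InterpolateRow = InterpolateRow_Any_SSE2;
  if (IS_ALIGNED(width, 4)) {
    InterpolateRow = InterpolateRow_SSE2;
  }
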
diff --git a/source/row_any.cc b/source/row_any.cc
index aaa0378..31ab08a 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -579,11 +579,11 @@ NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
InterpolateRow_C, 1, 1, 32)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3,
InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2,
InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
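
[Note] The NANY entries above only rebind which full-width kernel the _Any_ wrapper calls. Roughly, such a wrapper runs the SIMD kernel over the largest multiple-of-16 prefix of the row and finishes the tail in C; a simplified sketch (hypothetical function approximating what the NANY macro expands to for a 1-byte-per-pixel plane):

  #include <stddef.h>  // ptrdiff_t
  typedef unsigned char uint8;

  // Kernels assumed from the library (declared here for the sketch).
  void InterpolateRow_SSE2(uint8*, const uint8*, ptrdiff_t, int, int);
  void InterpolateRow_C(uint8*, const uint8*, ptrdiff_t, int, int);

  void InterpolateRow_Any_SSE2_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                      ptrdiff_t src_stride, int width,
                                      int source_y_fraction) {
    int n = width & ~15;  // largest multiple of 16
    if (n > 0) {
      InterpolateRow_SSE2(dst_ptr, src_ptr, src_stride, n, source_y_fraction);
    }
    InterpolateRow_C(dst_ptr + n, src_ptr + n, src_stride,
                     width & 15, source_y_fraction);  // 0..15 leftover pixels
  }
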
diff --git a/source/row_win.cc b/source/row_win.cc
index f507718..c8f5550 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6322,7 +6322,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_AVX2
-#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -6358,225 +6357,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
align 4
xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 64
- je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- cmp eax, 192
- je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- psrlw xmm5, 1
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- punpcklqdq xmm5, xmm5
- pxor xmm4, xmm4
-
- align 4
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
- paddw xmm3, xmm3
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 4
- xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
movdqu xmm1, xmm0
@@ -6650,9 +6430,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
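
[Note] Every InterpolateRow variant in this file computes the same vertical blend; the diff only swaps movdqa for movdqu and renames. What the deleted assembly's comments describe, as scalar C (a paraphrase for reference; the SSE2 and SSSE3 paths differ slightly in rounding, and fractions 0/64/128/192 take the pavgb fast paths shown above):

  #include <stddef.h>  // ptrdiff_t
  typedef unsigned char uint8;

  // Blend two rows by source_y_fraction f in 0..255:
  // f == 0 copies row0, f == 128 is the 50/50 average.
  void InterpolateRow_sketch(uint8* dst, const uint8* src,
                             ptrdiff_t src_stride, int width, int f) {
    const uint8* src1 = src + src_stride;
    for (int x = 0; x < width; ++x) {
      dst[x] = (uint8)((src[x] * (256 - f) + src1[x] * f) >> 8);
    }
  }
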
diff --git a/source/scale.cc b/source/scale.cc
index 5b33b5f..f37da7e 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -59,16 +59,9 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
#elif defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
- ScaleRowDown2Box_Unaligned_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
- ScaleRowDown2Box_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
}
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
@@ -114,17 +107,9 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
}
#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ?
- ScaleRowDown2_Unaligned_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
- ScaleRowDown2Box_Unaligned_16_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
- ScaleRowDown2Box_16_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
}
#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
@@ -889,10 +874,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -900,10 +882,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
@@ -991,10 +970,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
@@ -1002,10 +978,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
@@ -1090,10 +1063,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -1101,10 +1071,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
@@ -1229,10 +1196,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
@@ -1240,10 +1204,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
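
[Note] The ScaleRowDown2/Linear/Box kernels selected above are the three filtering flavors of 2x downsampling. Scalar equivalents for reference (these mirror the portable C fallbacks; the asm's "isolate odd pixels" is the kFilterNone case):

  #include <stddef.h>  // ptrdiff_t
  typedef unsigned char uint8;

  // kFilterNone: keep every second (odd) source pixel.
  void ScaleRowDown2_sketch(const uint8* src, uint8* dst, int dst_width) {
    for (int x = 0; x < dst_width; ++x) dst[x] = src[2 * x + 1];
  }

  // kFilterLinear: rounded average of each horizontal pair, one row.
  void ScaleRowDown2Linear_sketch(const uint8* src, uint8* dst,
                                  int dst_width) {
    for (int x = 0; x < dst_width; ++x)
      dst[x] = (uint8)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
  }

  // kFilterBox: rounded average of a 2x2 block across two rows.
  void ScaleRowDown2Box_sketch(const uint8* src, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
    const uint8* s1 = src + src_stride;
    for (int x = 0; x < dst_width; ++x)
      dst[x] = (uint8)((src[2 * x] + src[2 * x + 1] +
                        s1[2 * x] + s1[2 * x + 1] + 2) >> 2);
  }
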
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index e339cd7..b6d5129 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -193,10 +193,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -204,10 +201,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
@@ -289,10 +283,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -300,10 +291,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
@@ -430,10 +418,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
@@ -470,10 +455,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -481,10 +463,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
diff --git a/source/scale_common.cc b/source/scale_common.cc
index e4b2acc..459c61a 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -888,11 +888,7 @@ void ScalePlaneVertical(int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
@@ -900,11 +896,7 @@ void ScalePlaneVertical(int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
@@ -970,11 +962,7 @@ void ScalePlaneVertical_16(int src_height,
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
@@ -982,11 +970,7 @@ void ScalePlaneVertical_16(int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
diff --git a/source/scale_posix.cc b/source/scale_posix.cc
index 352e667..b929c93 100644
--- a/source/scale_posix.cc
+++ b/source/scale_posix.cc
@@ -101,110 +101,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -226,9 +122,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -236,7 +131,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
@@ -262,9 +157,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
);
}
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -273,9 +167,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -315,8 +209,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@@ -348,8 +242,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
@@ -412,8 +306,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"palignr $0x8,%%xmm0,%%xmm1 \n"
@@ -461,7 +355,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
@@ -479,7 +373,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -533,7 +427,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
@@ -553,7 +447,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm6,%%xmm7 \n"
@@ -590,8 +484,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -631,7 +525,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -679,7 +573,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
"movhlps %%xmm0,%%xmm1 \n"
"movhlps %%xmm6,%%xmm7 \n"
@@ -741,7 +635,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -753,7 +647,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"2: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@@ -765,8 +659,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"3: \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%4 \n"
@@ -870,14 +764,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"sub $0x20,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
@@ -898,12 +792,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -923,15 +817,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -951,8 +845,8 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
@@ -964,7 +858,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -1003,7 +897,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -1056,7 +950,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -1156,14 +1050,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
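
[Note] The loops above share one idiom for averaging adjacent byte pairs after the two rows are averaged with pavgb: build a 0x00ff mask (pcmpeqb + psrlw), split each 16-byte vector into odd bytes (psrlw 8) and even bytes (pand), then pavgw the halves. The same trick as intrinsics (an illustrative helper, not a libyuv function):

  #include <emmintrin.h>  // SSE2

  // Returns eight 16-bit lanes, each the rounded average of one
  // even/odd byte pair of 'v'; packuswb on two such results yields
  // 16 output bytes, as in ScaleRowDown2Linear/Box above.
  static __m128i AverageBytePairs(__m128i v) {
    __m128i mask = _mm_srli_epi16(_mm_cmpeq_epi8(v, v), 8);  // 0x00ff lanes
    __m128i odd = _mm_srli_epi16(v, 8);                      // psrlw $8
    __m128i even = _mm_and_si128(v, mask);                   // pand
    return _mm_avg_epu16(even, odd);                         // pavgw
  }
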
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 840b973..091587e 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -105,117 +105,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 4
- wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -234,9 +123,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
@@ -273,9 +161,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
@@ -331,8 +218,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
@@ -366,16 +253,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ movdqu xmm4, [eax + edi]
+ movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@@ -429,8 +316,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
@@ -483,8 +370,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
@@ -501,8 +388,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
@@ -542,8 +429,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -562,8 +449,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@@ -599,8 +486,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
lea eax, [eax + 32]
pshufb xmm0, xmm4
pshufb xmm1, xmm5
@@ -635,8 +522,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
align 4
xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
movhlps xmm1, xmm0
movhlps xmm7, xmm6
punpcklbw xmm0, xmm5
@@ -645,7 +532,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
punpcklbw xmm7, xmm5
paddusw xmm0, xmm6
paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
+ movdqu xmm6, [eax + esi * 2]
lea eax, [eax + 16]
movhlps xmm7, xmm6
punpcklbw xmm6, xmm5
@@ -701,7 +588,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
align 4
xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm0, [eax] // average 2 rows into xmm0
pavgb xmm0, [eax + esi]
lea eax, [eax + 16]
@@ -750,7 +637,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
xloop:
// first row
- movdqa xmm0, [esi]
+ movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
@@ -763,7 +650,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// sum remaining rows
align 4
yloop:
- movdqa xmm2, [eax] // read 16 pixels
+ movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
@@ -775,8 +662,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
align 4
ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16
@@ -891,14 +778,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
sub ecx, 32
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg wloop
@@ -920,12 +807,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
@@ -947,15 +834,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
@@ -978,10 +865,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
@@ -990,7 +877,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
@@ -1027,7 +914,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
@@ -1076,7 +963,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
@@ -1267,14 +1154,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg wloop