author     fbarchard@google.com <fbarchard@google.com>  2014-10-03 21:11:37 +0000
committer  fbarchard@google.com <fbarchard@google.com>  2014-10-03 21:11:37 +0000
commit     b720049a5483bc2e6fa78ea82369ddd476918b7f (patch)
tree       316f00d02d35087e92fa32881b344182534718c2
parent     147bbede9dad10ddfc3185d082e183537f64355b (diff)
download   libyuv-b720049a5483bc2e6fa78ea82369ddd476918b7f.tar.gz
Make the row functions used by planar_functions and convert use movdqu, to relax the alignment constraint. Step 1: make the functions unaligned.
BUG=365
TESTED=libyuv_unittest passes
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/26709004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1111 16f28f9a-4ce2-e073-06de-1de4eb20be90
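The planar_functions.cc half of the patch is purely dispatch-level: each selection site drops the pointer and stride IS_ALIGNED tests and keeps only the width test. A minimal sketch of the before/after pattern — the names mirror libyuv's but the snippet is illustrative, not the library's exact code:

    // Hypothetical dispatch sketch. IS_ALIGNED matches libyuv's macro;
    // CopyRow_C / CopyRow_SSE2 stand in for real row kernels defined elsewhere.
    #include <stdint.h>

    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

    typedef void (*CopyRowFn)(const uint8_t* src, uint8_t* dst, int width);
    void CopyRow_C(const uint8_t* src, uint8_t* dst, int width);
    void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);

    CopyRowFn PickCopyRow(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride, int width) {
      CopyRowFn fn = CopyRow_C;
      // Before this patch the SSE2 path also required 16-byte alignment:
      //   IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      //   IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)
      // With a movdqu-based kernel only the width multiple still matters.
      if (IS_ALIGNED(width, 32)) {
        fn = CopyRow_SSE2;
      }
      return fn;
    }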
-rw-r--r--  README.chromium              2
-rw-r--r--  include/libyuv/version.h     2
-rw-r--r--  source/planar_functions.cc   113
-rw-r--r--  source/row_posix.cc          418
-rw-r--r--  source/row_win.cc            432
5 files changed, 455 insertions, 512 deletions
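The row_posix.cc and row_win.cc halves are the mechanical counterpart: movdqa loads and stores in the affected kernels become movdqu. movdqa raises a general-protection fault on an address that is not 16-byte aligned, while movdqu accepts any address, with little or no penalty on recent x86 cores when the data happens to be aligned anyway. An equivalent intrinsics sketch of such a kernel — illustrative only, not code from this patch:

    // Hypothetical unaligned copy kernel; _mm_loadu_si128/_mm_storeu_si128
    // compile to movdqu. width is assumed a multiple of 32, matching the
    // IS_ALIGNED(width, 32) dispatch test above.
    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
      for (int x = 0; x < width; x += 32) {
        __m128i v0 = _mm_loadu_si128((const __m128i*)(src + x));       // movdqu
        __m128i v1 = _mm_loadu_si128((const __m128i*)(src + x + 16));  // movdqu
        _mm_storeu_si128((__m128i*)(dst + x), v0);                     // movdqu
        _mm_storeu_si128((__m128i*)(dst + x + 16), v1);                // movdqu
      }
    }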
diff --git a/README.chromium b/README.chromium
index 91c1ac3..02d8c23 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1109
+Version: 1111
License: BSD
License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 8895d54..6d723e5 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1109
+#define LIBYUV_VERSION 1111
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d5111dc..a4d40fa 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -47,9 +47,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
@@ -101,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
@@ -254,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
@@ -307,14 +301,8 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
@@ -385,14 +373,8 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
@@ -504,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
@@ -824,10 +804,7 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
@@ -894,10 +871,7 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
@@ -956,10 +930,7 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
@@ -1084,9 +1055,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
#endif
@@ -1150,8 +1119,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
@@ -1202,9 +1170,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@@ -1317,9 +1283,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
@@ -1355,8 +1319,7 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
@@ -1388,8 +1351,7 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
#elif defined(HAS_ARGBSEPIAROW_NEON)
@@ -1430,8 +1392,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
@@ -1573,8 +1534,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
#elif defined(HAS_ARGBQUANTIZEROW_NEON)
@@ -1748,9 +1708,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#elif defined(HAS_ARGBSHADEROW_NEON)
@@ -1882,11 +1840,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBShuffleRow = ARGBShuffleRow_SSSE3;
- }
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
#endif
@@ -1942,8 +1896,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
// ARGBToBayer used to select G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
@@ -1951,8 +1904,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@@ -2043,8 +1995,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
@@ -2065,8 +2016,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
@@ -2088,8 +2038,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
@@ -2213,10 +2162,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
@@ -2259,10 +2205,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 36111c1..e92c9f5 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -252,8 +252,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -318,17 +318,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
@@ -359,17 +359,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
@@ -418,8 +418,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -475,8 +475,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -519,8 +519,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
- MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -631,7 +631,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
@@ -672,7 +672,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@@ -712,7 +712,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
@@ -744,10 +744,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -760,7 +760,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -820,10 +820,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -837,7 +837,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -912,10 +912,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@@ -979,10 +979,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@@ -1187,10 +1187,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1202,11 +1202,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
@@ -1219,7 +1219,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"paddb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -1317,10 +1317,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1430,10 +1430,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1446,7 +1446,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
@@ -1513,10 +1513,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@@ -1640,10 +1640,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1656,7 +1656,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
@@ -1714,10 +1714,10 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1730,7 +1730,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
@@ -1797,10 +1797,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@@ -1933,10 +1933,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@@ -2199,8 +2199,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2354,8 +2354,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2393,8 +2393,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2430,8 +2430,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2464,8 +2464,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2686,8 +2686,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2725,8 +2725,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2765,8 +2765,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -2939,8 +2939,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
@@ -2970,7 +2970,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+ MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
@@ -3104,8 +3104,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
@@ -3115,8 +3115,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
@@ -3183,7 +3183,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -3405,16 +3405,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -3498,13 +3498,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@@ -3527,11 +3527,11 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -3572,8 +3572,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3722,14 +3722,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@@ -3751,11 +3751,11 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -3796,8 +3796,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@@ -4014,7 +4014,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
@@ -4132,16 +4132,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
@@ -4151,7 +4151,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
"jmp 49f \n"
@@ -4178,7 +4178,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
@@ -4237,17 +4237,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
@@ -4256,7 +4256,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -4389,16 +4389,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
@@ -4411,8 +4411,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -4455,30 +4455,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
@@ -4488,8 +4488,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
@@ -4520,12 +4520,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddsw %%xmm7,%%xmm0 \n"
@@ -4535,13 +4535,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm6 \n"
@@ -4554,8 +4554,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
@@ -4593,14 +4593,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
@@ -4608,7 +4608,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
@@ -4637,7 +4637,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -4648,7 +4648,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -4901,8 +4901,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -4919,10 +4919,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm1," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@@ -4953,12 +4953,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@@ -4992,8 +4992,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
@@ -5010,10 +5010,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm6," MEMACCESS(2) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@@ -5060,22 +5060,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(2) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
- "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -5140,10 +5140,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel small loop \n"
LABELALIGN
"4: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
@@ -5174,10 +5174,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel loop \n"
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
@@ -5221,7 +5221,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop \n"
LABELALIGN
"10: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"psubd " MEMACCESS(1) ",%%xmm0 \n"
@@ -5822,8 +5822,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"pshufd $0x0,%%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -5852,8 +5852,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
@@ -5882,17 +5882,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@@ -5909,7 +5909,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index 1cf0b9a..9996ccd 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -326,8 +326,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
punpckhwd xmm1, xmm1
por xmm0, xmm5
por xmm1, xmm5
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -386,17 +386,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
jg convertloop
ret
@@ -426,17 +426,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
jg convertloop
ret
@@ -491,8 +491,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -545,8 +545,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -585,8 +585,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -688,7 +688,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R
@@ -728,7 +728,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
movdqa xmm3, xmm0 // R
@@ -766,7 +766,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
@@ -795,10 +795,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -811,7 +811,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -830,10 +830,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -847,7 +847,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -1016,10 +1016,10 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -1032,7 +1032,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -1084,10 +1084,10 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -1100,7 +1100,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -1152,10 +1152,10 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -1168,7 +1168,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -1228,10 +1228,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
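
Step 1's pavgb against [eax + esi] (the row below) plus the later
horizontal averaging subsamples chroma over a 2x2 block; step 2 is the
same pmaddubsw dot-product scheme with U and V coefficients. A scalar
sketch (coefficients assumed; the 0x8080 term folds together the rounding
bias and the +128 chroma offset):

  #include <stdint.h>

  static inline uint8_t ARGBToU(uint8_t r, uint8_t g, uint8_t b) {
    return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  }

  static inline uint8_t ARGBToV(uint8_t r, uint8_t g, uint8_t b) {
    return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }

The UVJ row below differs only in using the full-range JPEG coefficients.
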
@@ -1294,10 +1294,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
@@ -1567,10 +1567,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
align 4
convertloop:
/* convert to U and V */
- movdqa xmm0, [eax] // U
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
@@ -1582,12 +1582,12 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0

- movdqa xmm0, [eax] // V
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // V
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm6
pmaddubsw xmm1, xmm6
pmaddubsw xmm2, xmm6
@@ -1599,7 +1599,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm2
paddb xmm0, xmm5
lea eax, [eax + 64]
- movdqa [edx + edi], xmm0
+ movdqu [edx + edi], xmm0
lea edx, [edx + 16]
jg convertloop
@@ -1683,10 +1683,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1803,10 +1803,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
@@ -1939,10 +1939,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
@@ -2075,10 +2075,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
@@ -2423,8 +2423,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
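
In this converter family the YUV-to-RGB arithmetic happens just above the
shown context; by these lines xmm0 holds interleaved BG words and xmm2 RA
words, and punpcklwd/punpckhwd zip them into 8 BGRA pixels for the two
stores. For reference, the per-pixel studio-range conversion as a scalar
sketch with the commonly used fixed-point constants (an assumption, not
lifted from this file):

  #include <stdint.h>

  static inline uint8_t Clamp(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  // BT.601 studio range: 1.164*(Y-16) plus chroma terms, in 8.8 fixed point.
  static inline void YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                              uint8_t* b, uint8_t* g, uint8_t* r) {
    int c = 298 * (y - 16);
    int d = u - 128;
    int e = v - 128;
    *b = Clamp((c + 516 * d + 128) >> 8);
    *g = Clamp((c - 100 * d - 208 * e + 128) >> 8);
    *r = Clamp((c + 409 * e + 128) >> 8);
  }
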
@@ -2633,8 +2633,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2678,8 +2678,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2718,8 +2718,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2756,8 +2756,8 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3004,8 +3004,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3086,8 +3086,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3168,8 +3168,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3260,8 +3260,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
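
YToARGB has no chroma inputs: it scales luma and replicates it into B, G
and R, with por xmm4 forcing alpha to 255. A scalar sketch (the
1.164 = 298/256 studio-range scale is assumed):

  #include <stdint.h>

  static inline uint32_t YToARGB(uint8_t y) {
    int v = (298 * (y - 16) + 128) >> 8;  // 1.164 * (Y - 16)
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    return 0xff000000u | ((uint32_t)v << 16) | ((uint32_t)v << 8) | (uint32_t)v;
  }
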
@@ -3468,8 +3468,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
@@ -3479,8 +3479,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm2, 8 // odd bytes
psrlw xmm3, 8
packuswb xmm2, xmm3
- movdqa [edx], xmm0
- movdqa [edx + edi], xmm2
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
@@ -3581,14 +3581,14 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
align 4
convertloop:
- movdqa xmm0, [eax] // read 16 U's
- movdqa xmm1, [eax + edx] // and 16 V's
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 // first 8 UV pairs
punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16
jg convertloop
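
SplitUVRow and MergeUVRow are inverses: one de-interleaves a packed UV
plane (as in NV12) into separate U and V planes, the other re-interleaves
two planes into UV pairs. Scalar sketches of both:

  #include <stdint.h>

  void SplitUV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int pix) {
    for (int i = 0; i < pix; ++i) {
      dst_u[i] = src_uv[2 * i + 0];
      dst_v[i] = src_uv[2 * i + 1];
    }
  }

  void MergeUV(const uint8_t* src_u, const uint8_t* src_v,
               uint8_t* dst_uv, int pix) {
    for (int i = 0; i < pix; ++i) {
      dst_uv[2 * i + 0] = src_u[i];
      dst_uv[2 * i + 1] = src_v[i];
    }
  }

The SSE2 versions do the same 16 pixels at a time: pand/psrlw pick even
and odd bytes apart for the split, punpcklbw/punpckhbw interleave for the
merge.
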
@@ -3763,19 +3763,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
align 4
convertloop:
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
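
The pand/por sequence above is a bitwise select: keep the destination's
RGB and take the source's alpha, using complementary masks (presumably
0xff000000 in xmm0 and 0x00ffffff in xmm1). Scalar equivalent:

  #include <stdint.h>

  static inline uint32_t CopyAlpha(uint32_t src_argb, uint32_t dst_argb) {
    return (src_argb & 0xff000000u) | (dst_argb & 0x00ffffffu);
  }
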
@@ -3835,16 +3835,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
punpcklbw xmm2, xmm2
punpckhwd xmm3, xmm2
punpcklwd xmm2, xmm2
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -4173,14 +4173,14 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
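
YUY2 packs two pixels into four bytes as Y0 U Y1 V, so luma sits in the
even bytes; pand with the word mask in xmm5 (presumably 0x00ff) keeps them
and packuswb compacts the result. A scalar sketch:

  #include <stdint.h>

  void YUY2ToY(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
    for (int i = 0; i < pix; ++i) {
      dst_y[i] = src_yuy2[2 * i];  // even bytes are Y
    }
  }

The UV rows that follow additionally pavgb each line against the one below
(the esi stride) before extracting chroma.
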
@@ -4204,10 +4204,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
@@ -4246,8 +4246,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
@@ -4385,14 +4385,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
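
UYVY is the byte-swapped layout, U Y0 V Y1: luma is in the odd bytes, so
this row uses psrlw 8 instead of a mask. Scalar sketch:

  #include <stdint.h>

  void UYVYToY(const uint8_t* src_uyvy, uint8_t* dst_y, int pix) {
    for (int i = 0; i < pix; ++i) {
      dst_y[i] = src_uyvy[2 * i + 1];  // odd bytes are Y
    }
  }
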
@@ -4416,10 +4416,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
@@ -4458,8 +4458,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
@@ -4666,7 +4666,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertloop4
@@ -4782,16 +4782,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
convertloop4:
- movdqa xmm3, [eax] // src argb
+ movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
- movdqa xmm2, [esi] // _r_b
+ movdqu xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
- movdqa xmm1, [esi] // _a_g
+ movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
@@ -4801,7 +4801,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertloop4
jmp convertloop4b
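
Per channel, the blend computes src + (dst * (256 - src_alpha) >> 8), with
por xmm4 forcing the output alpha to 255; this assumes a premultiplied
(attenuated) source. A scalar sketch:

  #include <stdint.h>

  static inline uint8_t BlendChannel(uint8_t src, uint8_t dst, uint8_t src_a) {
    int v = src + ((dst * (256 - src_a)) >> 8);
    return (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates in the row code
  }
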
@@ -4827,7 +4827,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertuloop4
@@ -4883,17 +4883,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm1 // next 2 pixels
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // alphas
+ movdqu xmm2, [eax] // alphas
lea eax, [eax + 16]
psrlw xmm0, 8
pand xmm2, xmm4
@@ -4902,7 +4902,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
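
Attenuation multiplies each color channel by the pixel's own alpha and
keeps alpha itself unchanged. The punpcklbw/pmulhuw trick computes
(x*257 * a*257) >> 16, a close approximation of x*a/255 without a divide.
Exact scalar form for comparison:

  #include <stdint.h>

  static inline uint32_t Attenuate(uint32_t argb) {
    uint32_t a = argb >> 24;
    uint32_t r = (((argb >> 16) & 0xff) * a) / 255;
    uint32_t g = (((argb >> 8) & 0xff) * a) / 255;
    uint32_t b = ((argb & 0xff) * a) / 255;
    return (a << 24) | (r << 16) | (g << 8) | b;
  }
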
@@ -5177,16 +5177,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
align 4
convertloop:
- movdqa xmm0, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm0, xmm1
paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 G bytes
- movdqa xmm2, [eax] // A
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
@@ -5199,8 +5199,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg convertloop
ret
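
ARGBGray, and the sepia and color-matrix rows after it, share one pattern:
pmaddubsw dot products in 7-bit fixed point, a rounding add, psrlw 7, then
repack, with alpha carried through separately. A scalar sketch with
illustrative weights (any BT.601-like triple summing to 128 fits the
psrlw 7 scheme):

  #include <stdint.h>

  static inline uint8_t Gray(uint8_t b, uint8_t g, uint8_t r) {
    return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
  }
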
@@ -5237,30 +5237,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
@@ -5270,8 +5270,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
lea eax, [eax + 32]
jg convertloop
ret
@@ -5300,12 +5300,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm7, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm7, xmm2
- movdqa xmm6, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
@@ -5315,13 +5315,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
- movdqa xmm1, [eax] // R
- movdqa xmm7, [eax + 16]
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
- movdqa xmm6, [eax] // A
- movdqa xmm7, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
@@ -5334,8 +5334,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4
sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm6
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
lea eax, [eax + 32]
lea edx, [edx + 32]
jg convertloop
@@ -5368,14 +5368,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
- movdqa xmm7, [eax] // read 4 pixels
+ movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
@@ -5383,7 +5383,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
packuswb xmm0, xmm1
por xmm0, xmm7
sub ecx, 4
- movdqa [eax], xmm0
+ movdqu [eax], xmm0
lea eax, [eax + 16]
jg convertloop
ret
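
Quantize posterizes in place: pmulhuw scales each channel down to a bucket
index (v * scale >> 16), pmullw multiplies back up by interval_size, and
the paddw adds the offset held in xmm4, while pand/por preserve alpha. A
scalar sketch (parameter names follow the function's arguments):

  #include <stdint.h>

  static inline uint8_t Quantize(uint8_t v, int scale, int interval_size,
                                 int interval_offset) {
    return (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
  }
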
@@ -5407,7 +5407,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
@@ -5418,7 +5418,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
@@ -5775,8 +5775,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
movdqa xmm2, xmm0 // GG
@@ -5793,10 +5793,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
por xmm3, xmm5 // GGGA
por xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm1
- movdqa [edx + 16], xmm2
- movdqa [edx + 32], xmm3
- movdqa [edx + 48], xmm0
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
lea edx, [edx + 64]
jg convertloop
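
paddusb saturates, so sobel = min(sobelx + sobely, 255); the punpck/por
sequence then fans the gray value out to (G, G, G, 255) ARGB pixels. The
SobelToPlane and SobelXY rows below reuse the same saturated add, writing
a single plane and an x/sobel/y channel packing respectively. The core:

  #include <stdint.h>

  static inline uint8_t Sobel(uint8_t sx, uint8_t sy) {
    int s = sx + sy;
    return (uint8_t)(s > 255 ? 255 : s);
  }
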
@@ -5821,12 +5821,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
@@ -5856,8 +5856,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
paddusb xmm2, xmm1 // sobel = sobelx + sobely
@@ -5874,10 +5874,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4
sub ecx, 16
- movdqa [edx], xmm6
- movdqa [edx + 16], xmm4
- movdqa [edx + 32], xmm7
- movdqa [edx + 48], xmm1
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
lea edx, [edx + 64]
jg convertloop
@@ -5933,10 +5933,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
align 4
s4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -5976,10 +5976,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
align 4
l4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -6028,7 +6028,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop
align 4
l1:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
@@ -6084,26 +6084,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
- movdqa xmm2, [esi] // previous row above.
+ movdqu xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
- movdqa xmm3, [esi + 16]
+ movdqu xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
- movdqa xmm4, [esi + 32]
+ movdqu xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
- movdqa xmm5, [esi + 48]
+ movdqu xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- movdqa [edx + 32], xmm4
- movdqa [edx + 48], xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
lea edx, [edx + 64]
sub ecx, 4
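
These two routines implement box filtering with a summed-area table:
ComputeCumulativeSumRow builds running 32-bit per-channel sums (each
output adds the current pixel, the running sum on this row, and the row
above), and CumulativeSumToAverageRow recovers any window sum from four
lookups. A sketch of the window arithmetic, with illustrative names:

  #include <stdint.h>

  // One channel of a box average; 'right' is the element offset of the
  // window's right edge.
  static inline uint8_t BoxAverage(const int32_t* topleft,
                                   const int32_t* botleft,
                                   int right, int area) {
    int32_t sum = topleft[0] - topleft[right] - botleft[0] + botleft[right];
    return (uint8_t)(sum / area);  // the SSE2 path avoids a per-pixel divide
  }
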
@@ -6552,8 +6552,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
@@ -6580,8 +6580,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrld xmm0, 8 // Move green to bottom.
psrld xmm1, 8
@@ -6605,19 +6605,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
+ movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg wloop
ret
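
pshufb applies a byte permutation, so one instruction reorders four pixels
at a time; the 16-byte control mask is loaded from the caller's shuffler.
A per-pixel scalar sketch, where map[] gives the source index of each
output byte (e.g. {3, 2, 1, 0} swaps ARGB and BGRA byte order):

  #include <stdint.h>

  void ShuffleARGB(const uint8_t* src, uint8_t* dst,
                   const uint8_t map[4], int pix) {
    for (int i = 0; i < pix; ++i) {
      for (int j = 0; j < 4; ++j) {
        dst[4 * i + j] = src[4 * i + map[j]];
      }
    }
  }
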
@@ -6631,7 +6631,7 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
+ movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
align 4