diff options
author | fbarchard@google.com <fbarchard@google.com> | 2014-10-03 21:11:37 +0000 |
---|---|---|
committer | fbarchard@google.com <fbarchard@google.com> | 2014-10-03 21:11:37 +0000 |
commit | b720049a5483bc2e6fa78ea82369ddd476918b7f (patch) | |
tree | 316f00d02d35087e92fa32881b344182534718c2 | |
parent | 147bbede9dad10ddfc3185d082e183537f64355b (diff) | |
download | libyuv-b720049a5483bc2e6fa78ea82369ddd476918b7f.tar.gz |
Make row functions used for planar functions and convert use movdqu to relax alignment constraint. Step 1 - make functions unaligned.
BUG=365
TESTED=libyuv_unittest passes
R=harryjin@google.com
Review URL: https://webrtc-codereview.appspot.com/26709004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1111 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | source/planar_functions.cc | 113 | ||||
-rw-r--r-- | source/row_posix.cc | 418 | ||||
-rw-r--r-- | source/row_win.cc | 432 |
5 files changed, 455 insertions, 512 deletions
diff --git a/README.chromium b/README.chromium index 91c1ac3..02d8c23 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1109 +Version: 1111 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8895d54..6d723e5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1109 +#define LIBYUV_VERSION 1111 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index d5111dc..a4d40fa 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -47,9 +47,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_SSE2; } #endif @@ -101,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_SSE2; } #endif @@ -254,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSSE3; } #endif @@ -307,14 +301,8 @@ int YUY2ToI422(const uint8* src_yuy2, int 
src_stride_yuy2, YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -385,14 +373,8 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; - } - } + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -504,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } #if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSSE3; } #endif @@ -824,10 +804,7 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToBGRARow = I422ToBGRARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; - } + I422ToBGRARow = I422ToBGRARow_SSSE3; } } #elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) @@ -894,10 +871,7 @@ int I422ToABGR(const uint8* src_y, int 
src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } + I422ToABGRRow = I422ToABGRRow_SSSE3; } } #endif @@ -956,10 +930,7 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } + I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -1084,9 +1055,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y, dst_stride_y = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { SetRow = SetRow_NEON; } #endif @@ -1150,8 +1119,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); return 0; } @@ -1202,9 +1170,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSE2; @@ -1317,9 +1283,7 @@ int 
ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } #elif defined(HAS_ARGBGRAYROW_NEON) @@ -1355,8 +1319,7 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } #elif defined(HAS_ARGBGRAYROW_NEON) @@ -1388,8 +1351,7 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBSEPIAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_SSSE3; } #elif defined(HAS_ARGBSEPIAROW_NEON) @@ -1430,8 +1392,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } #elif defined(HAS_ARGBCOLORMATRIXROW_NEON) @@ -1573,8 +1534,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { 
ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } #elif defined(HAS_ARGBQUANTIZEROW_NEON) @@ -1748,9 +1708,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_SSE2; } #elif defined(HAS_ARGBSHADEROW_NEON) @@ -1882,11 +1840,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } + ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -1942,8 +1896,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. 
#if defined(HAS_ARGBTOBAYERGGROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerGGRow_SSE2; @@ -1951,8 +1904,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; @@ -2043,8 +1995,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelRow = SobelRow_SSE2; } #endif @@ -2065,8 +2016,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_SSE2; } #endif @@ -2088,8 +2038,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if 
(TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_SSE2; } #endif @@ -2213,10 +2162,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif @@ -2259,10 +2205,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif diff --git a/source/row_posix.cc b/source/row_posix.cc index 36111c1..e92c9f5 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -252,8 +252,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -318,17 +318,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" 
"pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 @@ -359,17 +359,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_raw), // %0 @@ -418,8 +418,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -475,8 +475,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - 
MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -519,8 +519,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) - MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -631,7 +631,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -672,7 +672,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -712,7 +712,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -744,10 +744,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " 
MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -760,7 +760,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -820,10 +820,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -837,7 +837,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -912,10 +912,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" BUNDLEALIGN MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 @@ -979,10 +979,10 @@ void 
ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" BUNDLEALIGN MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 @@ -1187,10 +1187,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1202,11 +1202,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" @@ -1219,7 +1219,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "paddb %%xmm5,%%xmm0 \n" "lea " MEMLEA(0x40,0) ",%0 \n" BUNDLEALIGN - 
MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -1317,10 +1317,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1430,10 +1430,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1446,7 +1446,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra), // %0 @@ -1513,10 +1513,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " 
MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" BUNDLEALIGN MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 @@ -1640,10 +1640,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1656,7 +1656,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr), // %0 @@ -1714,10 +1714,10 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1730,7 +1730,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba), // %0 @@ -1797,10 +1797,10 @@ void ABGRToUVRow_SSSE3(const 
uint8* src_abgr0, int src_stride_abgr, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" BUNDLEALIGN MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 @@ -1933,10 +1933,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" BUNDLEALIGN MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 @@ -2199,8 +2199,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2354,8 +2354,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub 
$0x8,%[width] \n" "jg 1b \n" @@ -2393,8 +2393,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2430,8 +2430,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2464,8 +2464,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2686,8 +2686,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2725,8 +2725,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqa %%xmm1," 
MEMACCESS2(0x10,[dst_abgr]) "\n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2765,8 +2765,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2939,8 +2939,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" @@ -2970,7 +2970,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" @@ -3104,8 +3104,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" @@ -3115,8 +3115,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) + 
"movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" @@ -3183,7 +3183,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n" @@ -3405,16 +3405,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "punpcklbw %%xmm2,%%xmm2 \n" "punpckhwd %%xmm2,%%xmm3 \n" "punpcklwd %%xmm2,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3498,13 +3498,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" @@ -3527,11 +3527,11 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " 
MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" @@ -3572,8 +3572,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" @@ -3722,14 +3722,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3751,11 +3751,11 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" @@ -3796,8 +3796,8 @@ void 
UYVYToUV422Row_SSE2(const uint8* src_uyvy, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" @@ -4014,7 +4014,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4132,16 +4132,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" @@ -4151,7 +4151,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "jmp 49f \n" @@ -4178,7 +4178,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4237,17 +4237,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" @@ -4256,7 +4256,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4389,16 +4389,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" @@ -4411,8 +4411,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4455,30 
+4455,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4488,8 +4488,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4520,12 +4520,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm7,%%xmm0 \n" @@ -4535,13 +4535,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm7 \n" "phaddsw %%xmm7,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm7 \n" "phaddsw %%xmm7,%%xmm6 \n" @@ -4554,8 +4554,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" @@ -4593,14 +4593,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" @@ -4608,7 +4608,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4637,7 +4637,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4648,7 +4648,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4901,8 +4901,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -4919,10 +4919,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm1," MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm1," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4953,12 +4953,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4992,8 +4992,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" @@ -5010,10 +5010,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "sub $0x10,%3 \n" - "movdqa %%xmm6," MEMACCESS(2) " \n" - "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm6," MEMACCESS(2) " \n" + "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -5060,22 +5060,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" "lea " MEMLEA(0x40,2) ",%2 \n" "paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b 
\n" @@ -5140,10 +5140,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel small loop \n" LABELALIGN "4: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5174,10 +5174,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel loop \n" LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5221,7 +5221,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" LABELALIGN "10: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 "lea " MEMLEA(0x10,0) ",%0 \n" "psubd " MEMACCESS(1) ",%%xmm0 \n" @@ -5822,8 +5822,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, "pshufd $0x0,%%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -5852,8 +5852,8 
@@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "psrld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x8,%%xmm0 \n" "psrld $0x8,%%xmm1 \n" @@ -5882,17 +5882,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -5909,7 +5909,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 1cf0b9a..9996ccd 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -326,8 +326,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { punpckhwd xmm1, xmm1 por xmm0, xmm5 por xmm1, xmm5 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -386,17 +386,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int 
pix) { por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -426,17 +426,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -491,8 +491,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -545,8 +545,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -585,8 +585,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* 
dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -688,7 +688,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G pslld xmm0, 8 // R @@ -728,7 +728,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G movdqa xmm3, xmm0 // R @@ -766,7 +766,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 pand xmm0, xmm3 // low nibble pand xmm1, xmm4 // high nibble @@ -795,10 +795,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -811,7 +811,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -830,10 +830,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - 
movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -847,7 +847,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -1016,10 +1016,10 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -1032,7 +1032,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -1084,10 +1084,10 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -1100,7 +1100,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -1152,10 +1152,10 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -1168,7 +1168,7 @@ 
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -1228,10 +1228,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] @@ -1294,10 +1294,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] @@ -1567,10 +1567,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, align 4 convertloop: /* convert to U and V */ - movdqa xmm0, [eax] // U - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm7 pmaddubsw xmm1, xmm7 pmaddubsw xmm2, xmm7 @@ -1582,12 +1582,12 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 - movdqa xmm0, [eax] // V - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm6 pmaddubsw xmm1, xmm6 pmaddubsw xmm2, xmm6 @@ -1599,7 +1599,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, packsswb xmm0, 
xmm2 paddb xmm0, xmm5 lea eax, [eax + 64] - movdqa [edx + edi], xmm0 + movdqu [edx + edi], xmm0 lea edx, [edx + 16] jg convertloop @@ -1683,10 +1683,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1803,10 +1803,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] @@ -1939,10 +1939,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] @@ -2075,10 +2075,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] @@ -2423,8 +2423,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd 
xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2633,8 +2633,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2678,8 +2678,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2718,8 +2718,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2756,8 +2756,8 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3004,8 +3004,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, movdqa xmm0, xmm5 punpcklwd xmm5, xmm1 // BGRA first 4 pixels punpckhwd xmm0, xmm1 // BGRA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3086,8 +3086,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm2 punpcklwd xmm2, xmm0 // RGBA first 4 pixels punpckhwd xmm1, xmm0 // RGBA next 4 pixels - movdqa [edx], xmm2 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm1 
lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3168,8 +3168,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, movdqa xmm0, xmm5 punpcklwd xmm5, xmm1 // RGBA first 4 pixels punpckhwd xmm0, xmm1 // RGBA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3260,8 +3260,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3468,8 +3468,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 @@ -3479,8 +3479,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 - movdqa [edx], xmm0 - movdqa [edx + edi], xmm2 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 lea edx, [edx + 16] sub ecx, 16 jg convertloop @@ -3581,14 +3581,14 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, align 4 convertloop: - movdqa xmm0, [eax] // read 16 U's - movdqa xmm1, [eax + edx] // and 16 V's + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] movdqa xmm2, xmm0 punpcklbw xmm0, xmm1 // first 8 UV pairs punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqa [edi], xmm0 - movdqa [edi + 16], xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 lea edi, [edi + 32] sub ecx, 16 jg convertloop @@ -3763,19 +3763,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { align 4 convertloop: - movdqa xmm2, [eax] - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] lea eax, [eax + 32] - movdqa xmm4, [edx] 
- movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3835,16 +3835,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { punpcklbw xmm2, xmm2 punpckhwd xmm3, xmm2 punpcklwd xmm2, xmm2 - movdqa xmm4, [edx] - movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -4173,14 +4173,14 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -4204,10 +4204,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 @@ -4246,8 +4246,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 @@ -4385,14 +4385,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrlw xmm0, 
8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -4416,10 +4416,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 @@ -4458,8 +4458,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 @@ -4666,7 +4666,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertloop4 @@ -4782,16 +4782,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. 
convertloop4: - movdqa xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi] // _r_b + movdqu xmm2, [esi] // _r_b pshufb xmm3, kShuffleAlpha // alpha pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi] // _a_g + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 @@ -4801,7 +4801,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertloop4 jmp convertloop4b @@ -4827,7 +4827,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertuloop4 @@ -4883,17 +4883,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 pshufhw xmm2, xmm0, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm0, xmm2 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm1 // next 2 pixels pshufhw xmm2, xmm1, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm2, [eax] // alphas + movdqu xmm2, [eax] // alphas lea eax, [eax + 16] psrlw xmm0, 8 pand xmm2, xmm4 @@ -4902,7 +4902,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5177,16 +5177,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* 
dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 G bytes - movdqa xmm2, [eax] // A - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 @@ -5199,8 +5199,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg convertloop ret @@ -5237,30 +5237,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 B values - movdqa xmm5, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 G values punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 R values - movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 @@ -5270,8 +5270,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { punpcklwd xmm0, xmm5 // BGRA first 4 punpckhwd xmm1, xmm5 // BGRA next 4 sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 lea eax, [eax + 32] jg convertloop ret @@ -5300,12 
+5300,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm7, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm7, xmm2 - movdqa xmm6, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 phaddsw xmm0, xmm7 // B @@ -5315,13 +5315,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, packuswb xmm0, xmm0 // 8 B values packuswb xmm6, xmm6 // 8 G values punpcklbw xmm0, xmm6 // 8 BG values - movdqa xmm1, [eax] // R - movdqa xmm7, [eax + 16] + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 phaddsw xmm1, xmm7 // R - movdqa xmm6, [eax] // A - movdqa xmm7, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A @@ -5334,8 +5334,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, punpcklwd xmm0, xmm1 // BGRA first 4 punpckhwd xmm6, xmm1 // BGRA next 4 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm6 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 lea eax, [eax + 32] lea edx, [edx + 32] jg convertloop @@ -5368,14 +5368,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm5 // first 2 pixels pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 pmullw xmm0, xmm3 // * interval_size - movdqa xmm7, [eax] // read 4 pixels + movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 pand xmm7, xmm6 // mask alpha paddw xmm0, xmm4 // + interval_size / 2 @@ -5383,7 +5383,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, packuswb 
xmm0, xmm1 por xmm0, xmm7 sub ecx, 4 - movdqa [eax], xmm0 + movdqu [eax], xmm0 lea eax, [eax + 16] jg convertloop ret @@ -5407,7 +5407,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 // first 2 @@ -5418,7 +5418,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5775,8 +5775,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqa xmm2, xmm0 // GG @@ -5793,10 +5793,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, por xmm3, xmm5 // GGGA por xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm1 - movdqa [edx + 16], xmm2 - movdqa [edx + 32], xmm3 - movdqa [edx + 48], xmm0 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 lea edx, [edx + 64] jg convertloop @@ -5821,12 +5821,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5856,8 +5856,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa 
xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 paddusb xmm2, xmm1 // sobel = sobelx + sobely @@ -5874,10 +5874,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, punpcklwd xmm7, xmm0 // Next 4 punpckhwd xmm1, xmm0 // Last 4 sub ecx, 16 - movdqa [edx], xmm6 - movdqa [edx + 16], xmm4 - movdqa [edx + 32], xmm7 - movdqa [edx + 48], xmm1 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 lea edx, [edx + 64] jg convertloop @@ -5933,10 +5933,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, align 4 s4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -5976,10 +5976,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, align 4 l4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -6028,7 +6028,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop align 4 l1: - movdqa xmm0, [eax] + movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] lea eax, [eax + 16] psubd xmm0, [esi] @@ -6084,26 +6084,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpckhwd xmm5, xmm1 paddd xmm0, xmm2 - movdqa xmm2, [esi] // previous row above. + movdqu xmm2, [esi] // previous row above. 
paddd xmm2, xmm0 paddd xmm0, xmm3 - movdqa xmm3, [esi + 16] + movdqu xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 - movdqa xmm4, [esi + 32] + movdqu xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 - movdqa xmm5, [esi + 48] + movdqu xmm5, [esi + 48] lea esi, [esi + 64] paddd xmm5, xmm0 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 - movdqa [edx + 32], xmm4 - movdqa [edx + 48], xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 lea edx, [edx + 64] sub ecx, 4 @@ -6552,8 +6552,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pshufb xmm0, xmm5 pshufb xmm1, xmm5 @@ -6580,8 +6580,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrld xmm0, 8 // Move green to bottom. psrld xmm1, 8 @@ -6605,19 +6605,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] + movdqu xmm5, [ecx] mov ecx, [esp + 16] // pix align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pshufb xmm0, xmm5 pshufb xmm1, xmm5 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg wloop ret @@ -6631,7 +6631,7 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] + movdqu xmm5, [ecx] mov ecx, [esp + 16] // pix align 4 |