author     fbarchard@google.com <fbarchard@google.com>    2014-10-02 17:56:48 +0000
committer  fbarchard@google.com <fbarchard@google.com>    2014-10-02 17:56:48 +0000
commit     455ae94c60b6a58101cf303a467624bf0499cf21 (patch)
tree       b4de8151ac9c77c7236c4de7c2bf6db35a7df4d2
parent     044f914c297bb4fd34125879e1ea820dc62ac7db (diff)
download   libyuv-455ae94c60b6a58101cf303a467624bf0499cf21.tar.gz

Make rotate SIMD allow unaligned pointers.

BUG=365
TESTED=libyuv_unittest
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22899004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1102 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r--  README.chromium             2
-rw-r--r--  include/libyuv/version.h    2
-rw-r--r--  source/rotate.cc          104
-rw-r--r--  source/row_posix.cc        10
-rw-r--r--  source/row_win.cc          10

5 files changed, 59 insertions, 69 deletions
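The core of the change: each aligned 16-byte SSE2 load/store (movdqa) in the rotate and mirror kernels becomes its unaligned counterpart (movdqu), so the kernels no longer fault on pointers or strides that are not 16-byte aligned; that is why the IS_ALIGNED(src, 16) / IS_ALIGNED(src_stride, 16) conditions disappear from the dispatch code in rotate.cc below. A minimal sketch of the same idea with intrinsics (CopyBlocks16 is a hypothetical helper for illustration, not libyuv code):

    // Sketch only: assumes width is a multiple of 16, as the IS_ALIGNED(width, 16)
    // checks in the dispatch code guarantee for the real kernels.
    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    static void CopyBlocks16(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; i += 16) {
        // _mm_loadu_si128 compiles to movdqu: no 16-byte alignment requirement,
        // unlike _mm_load_si128 (movdqa), which faults on unaligned addresses.
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
      }
    }

On older SSE2 hardware movdqu was noticeably slower than movdqa even when the address happened to be aligned, but on Nehalem and later the penalty for unaligned instructions on aligned data is small, which is presumably why the alignment-specialized paths are being dropped here.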
diff --git a/README.chromium b/README.chromium
index 0b76d05..13c1a5d 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1101
+Version: 1102
License: BSD
License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index a3f60e0..5d27089 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1101
+#define LIBYUV_VERSION 1102
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/rotate.cc b/source/rotate.cc
index 890c4b5..34b6666 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -210,31 +210,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + edi]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + edi]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
- movdqa xmm4, [eax]
- movdqa xmm5, [eax + edi]
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
- movdqa xmm6, [eax]
- movdqa xmm7, [eax + edi]
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
- movdqa [esp], xmm5 // backup xmm5
+ movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
@@ -255,8 +255,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
- movdqa xmm5, [esp] // restore xmm5
- movdqa [esp], xmm6 // backup xmm6
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
@@ -267,7 +267,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
- movdqa xmm6, [esp] // restore xmm6
+ movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
@@ -427,31 +427,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
- "movdqa (%eax),%xmm0 \n"
- "movdqa (%eax,%edi,1),%xmm1 \n"
+ "movdqu (%eax),%xmm0 \n"
+ "movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
- "movdqa (%eax),%xmm2 \n"
- "movdqa (%eax,%edi,1),%xmm3 \n"
+ "movdqu (%eax),%xmm2 \n"
+ "movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
- "movdqa (%eax),%xmm4 \n"
- "movdqa (%eax,%edi,1),%xmm5 \n"
+ "movdqu (%eax),%xmm4 \n"
+ "movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
- "movdqa (%eax),%xmm6 \n"
- "movdqa (%eax,%edi,1),%xmm7 \n"
+ "movdqu (%eax),%xmm6 \n"
+ "movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm5,(%esp) \n"
+ "movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
@@ -471,8 +471,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
- "movdqa (%esp),%xmm5 \n"
- "movdqa %xmm6,(%esp) \n"
+ "movdqu (%esp),%xmm5 \n"
+ "movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
@@ -481,7 +481,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
- "movdqa (%esp),%xmm6 \n"
+ "movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
@@ -541,38 +541,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqa (%0,%3),%%xmm3 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
- "movdqa (%0),%%xmm4 \n"
+ "movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqa (%0,%3),%%xmm5 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
- "movdqa (%0),%%xmm6 \n"
+ "movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqa (%0,%3),%%xmm7 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -682,29 +682,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%4),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa (%0,%4),%%xmm3 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa (%0,%4),%%xmm5 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%4),%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -834,9 +834,7 @@ void TransposePlane(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
@@ -904,16 +902,12 @@ void RotatePlane180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
@@ -922,6 +916,7 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_AVX2;
}
#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -940,9 +935,7 @@ void RotatePlane180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
@@ -1032,9 +1025,7 @@ void TransposeUV(const uint8* src, int src_stride,
TransposeUVWx8 = TransposeUVWx8_NEON;
}
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
@@ -1106,8 +1097,7 @@ void RotateUV180(const uint8* src, int src_stride,
MirrorRowUV = MirrorUVRow_NEON;
}
#elif defined(HAS_MIRRORROW_UV_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRowUV = MirrorUVRow_SSSE3;
}
#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 9eb6780..5eefec4 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2970,10 +2970,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+ MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -3039,7 +3039,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
@@ -3077,11 +3077,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
diff --git a/source/row_win.cc b/source/row_win.cc
index c8f5550..61602d8 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3288,10 +3288,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
align 4
convertloop:
- movdqa xmm0, [eax + ecx]
+ movdqu xmm0, [eax + ecx]
pshufb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
@@ -3381,7 +3381,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
@@ -3413,11 +3413,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret