author     fbarchard@google.com <fbarchard@google.com>  2014-09-29 23:53:18 +0000
committer  fbarchard@google.com <fbarchard@google.com>  2014-09-29 23:53:18 +0000
commit     d33bf86b25a6dfab7b2a1c9a4a08e1046fd8326b (patch)
tree       ca6e68c1d31d5d3c596468b5716af249cdb6d01b
parent     c379d17195c31e0b32d0f61741adf8258305cc8b (diff)
download   libyuv-d33bf86b25a6dfab7b2a1c9a4a08e1046fd8326b.tar.gz
Add CopyRow_AVX, which supports unaligned pointers, for Sandy Bridge CPUs.
BUG=363
TESTED=out\release\libyuv_unittest --gtest_filter=*ARGBToARGB_*
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/31489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1097 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r--  README.chromium             2
-rw-r--r--  include/libyuv/row.h        2
-rw-r--r--  include/libyuv/version.h    2
-rw-r--r--  source/convert.cc          10
-rw-r--r--  source/planar_functions.cc  5
-rw-r--r--  source/rotate.cc            5
-rw-r--r--  source/rotate_argb.cc       5
-rw-r--r--  source/row_posix.cc        25
-rw-r--r--  source/row_win.cc          27
9 files changed, 81 insertions(+), 2 deletions(-)
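
In substance, the new kernel copies a row with two unaligned 32 byte AVX loads and stores per iteration, 64 bytes at a time; that is why every call site below gates it with IS_ALIGNED(width, 64). As a rough guide to the hand-written assembly in row_posix.cc and row_win.cc, here is a minimal intrinsics sketch; the function name is hypothetical and, like the real kernel, it assumes count is a positive multiple of 64:

#include <immintrin.h>
#include <stdint.h>

// Sketch of the CopyRow_AVX idea: vmovdqu tolerates unaligned pointers,
// so only the byte count needs to be a multiple of 64.
void CopyRowAvxSketch(const uint8_t* src, uint8_t* dst, int count) {
  for (int i = 0; i < count; i += 64) {
    __m256i lo = _mm256_loadu_si256((const __m256i*)(src + i));
    __m256i hi = _mm256_loadu_si256((const __m256i*)(src + i + 32));
    _mm256_storeu_si256((__m256i*)(dst + i), lo);
    _mm256_storeu_si256((__m256i*)(dst + i + 32), hi);
  }
}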
diff --git a/README.chromium b/README.chromium
index 895e95a..a2b74e7 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1096
+Version: 1097
License: BSD
License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 05a7888..e6e2c98 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -122,6 +122,7 @@ extern "C" {
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_AVX
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_HALFROW_SSE2
@@ -891,6 +892,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ab370ff..ca5be9b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1096
+#define LIBYUV_VERSION 1097
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index c31ecf2..f205143 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -201,6 +201,11 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
CopyRow = CopyRow_SSE2;
}
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_AVX;
+ }
+#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
@@ -441,6 +446,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_AVX;
+ }
+#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
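
The two hunks above show the selection ladder this patch extends at every CopyRow call site; the same five lines reappear below in planar_functions.cc, rotate.cc and rotate_argb.cc. Checks run from least to most preferred and each overwrites the previous choice, so ERMS, tested after AVX, still wins on CPUs that support it, and AVX is chosen only when width is a multiple of 64 because the new loop has no tail handling. Condensed into one sketch (ChooseCopyRow is a hypothetical helper; the flags and row functions are the real ones from this diff):

#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

typedef void (*CopyRowFn)(const uint8* src, uint8* dst, int count);

static CopyRowFn ChooseCopyRow(int width) {
  CopyRowFn CopyRow = CopyRow_C;  // portable fallback
  // SSE2 and X86 checks elided; see the context lines above.
#if defined(HAS_COPYROW_AVX)
  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
    CopyRow = CopyRow_AVX;  // loop has no tail code, so width % 64 == 0
  }
#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;  // a later check overrides an earlier one
  }
#endif
  return CopyRow;
}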
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 3857008..3cfa6ce 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -53,6 +53,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_AVX;
+ }
+#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
diff --git a/source/rotate.cc b/source/rotate.cc
index 7f9b13d..890c4b5 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -946,6 +946,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_SSE2;
}
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_AVX;
+ }
+#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index ab0f9ce..a8d7fc2 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -136,6 +136,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_SSE2;
}
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_AVX;
+ }
+#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 106fda5..b21002a 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3266,6 +3266,31 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_SSE2
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%ymm0 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%ymm0," MEMACCESS(1) " \n"
+ "movdqa %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "ymm0", "ymm1"
+#endif
+ );
+}
+#endif // HAS_COPYROW_AVX
+
#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
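
One detail worth noting when comparing the two ports: the row_win.cc version below ends with vzeroupper, which zeroes the upper halves of the ymm registers so that later SSE code avoids the AVX-SSE transition penalty on Sandy Bridge, while the GCC version above returns without it. In C the same effect comes from the _mm256_zeroupper() intrinsic; a minimal sketch with a hypothetical wrapper name:

#include <immintrin.h>

// Equivalent of the vzeroupper instruction emitted at the end of the
// row_win.cc CopyRow_AVX: clears bits 255:128 of every ymm register.
static inline void EndAvxSection(void) {
  _mm256_zeroupper();
}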
diff --git a/source/row_win.cc b/source/row_win.cc
index d79c353..f507718 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3687,6 +3687,32 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_SSE2
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using unaligned 32 byte loads/stores, 64 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_COPYROW_AVX
+
// Unaligned Multiple of 1.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
@@ -3704,6 +3730,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
}
#ifdef HAS_COPYROW_X86
+// Unaligned Multiple of 4.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
__asm {