diff options
author: fbarchard@google.com <fbarchard@google.com> (2014-09-29 23:53:18 +0000)
committer: fbarchard@google.com <fbarchard@google.com> (2014-09-29 23:53:18 +0000)
commit: d33bf86b25a6dfab7b2a1c9a4a08e1046fd8326b (patch)
tree: ca6e68c1d31d5d3c596468b5716af249cdb6d01b
parent: c379d17195c31e0b32d0f61741adf8258305cc8b (diff)
download: libyuv-d33bf86b25a6dfab7b2a1c9a4a08e1046fd8326b.tar.gz
CopyRow_AVX which supports unaligned pointers for Sandy Bridge CPU.
BUG=363
TESTED=out\release\libyuv_unittest --gtest_filter=*ARGBToARGB_*
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/31489004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1097 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r-- README.chromium | 2
-rw-r--r-- include/libyuv/row.h | 2
-rw-r--r-- include/libyuv/version.h | 2
-rw-r--r-- source/convert.cc | 10
-rw-r--r-- source/planar_functions.cc | 5
-rw-r--r-- source/rotate.cc | 5
-rw-r--r-- source/rotate_argb.cc | 5
-rw-r--r-- source/row_posix.cc | 25
-rw-r--r-- source/row_win.cc | 27
9 files changed, 81 insertions(+), 2 deletions(-)
diff --git a/README.chromium b/README.chromium index 895e95a..a2b74e7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1096 +Version: 1097 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 05a7888..e6e2c98 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -122,6 +122,7 @@ extern "C" { #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS +#define HAS_COPYROW_AVX #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 #define HAS_HALFROW_SSE2 @@ -891,6 +892,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_ERMS(const uint8* src, uint8* dst, int count); void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index ab370ff..ca5be9b 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1096 +#define LIBYUV_VERSION 1097 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index c31ecf2..f205143 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -201,6 +201,11 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; @@ -441,6 +446,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if 
(TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 3857008..3cfa6ce 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -53,6 +53,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/rotate.cc b/source/rotate.cc index 7f9b13d..890c4b5 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -946,6 +946,11 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index ab0f9ce..a8d7fc2 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -136,6 +136,11 @@ void ARGBRotate180(const uint8* src, int src_stride, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/row_posix.cc b/source/row_posix.cc index 106fda5..b21002a 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3266,6 +3266,31 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 +#ifdef HAS_COPYROW_AVX +// Copies 'count' bytes (multiple of 64) with unaligned 32 byte AVX loads/stores. +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " 
MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_AVX + #ifdef HAS_COPYROW_X86 void CopyRow_X86(const uint8* src, uint8* dst, int width) { size_t width_tmp = (size_t)(width); diff --git a/source/row_win.cc b/source/row_win.cc index d79c353..f507718 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3687,6 +3687,32 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 +#ifdef HAS_COPYROW_AVX +// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time. +__declspec(naked) __declspec(align(16)) +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 64 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_COPYROW_AVX + // Unaligned Multiple of 1. __declspec(naked) __declspec(align(16)) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { @@ -3704,6 +3730,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { } #ifdef HAS_COPYROW_X86 +// Unaligned Multiple of 4. __declspec(naked) __declspec(align(16)) void CopyRow_X86(const uint8* src, uint8* dst, int count) { __asm { |