diff options
author | Harish Mahendrakar <harish.mahendrakar@ittiam.com> | 2022-08-08 18:29:15 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-08-08 18:29:15 +0000 |
commit | f7ab9bbfd700b50012e041cc2a823c7ac681482e (patch) | |
tree | 4bd714368888e70bad2e0709002e0edffa3334ee /celt/x86 | |
parent | 975ff1d0441ac6688eec6d0281c621603788573b (diff) | |
parent | 0c5459731645f6137cf70a2ff47887d07f89e63f (diff) | |
download | libopus-f7ab9bbfd700b50012e041cc2a823c7ac681482e.tar.gz |
Merge commit 'c9d5bea13e3cb7381bfa897a45d8bab4e7b767a7' into HEAD am: 170fd0b529 am: 0a0c005652 am: 8e70d76446 am: 0c54597316
Original change: https://android-review.googlesource.com/c/platform/external/libopus/+/2164282
Change-Id: I567bb020c60e7bb36177c155e44cd5bc1891a3b2
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
Diffstat (limited to 'celt/x86')
-rw-r--r-- | celt/x86/celt_lpc_sse.h | 5 | ||||
-rw-r--r-- | celt/x86/pitch_sse.h | 6 | ||||
-rw-r--r-- | celt/x86/pitch_sse4_1.c | 51 | ||||
-rw-r--r-- | celt/x86/x86cpu.c | 37 | ||||
-rw-r--r-- | celt/x86/x86cpu.h | 46 |
5 files changed, 95 insertions, 50 deletions
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index 7d1ecf75..90e69ecf 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -33,7 +33,6 @@ #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) -#define OVERRIDE_CELT_FIR void celt_fir_sse4_1( const opus_val16 *x, @@ -44,10 +43,11 @@ void celt_fir_sse4_1( int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) +#define OVERRIDE_CELT_FIR #define celt_fir(x, num, y, N, ord, arch) \ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) -#else +#elif defined(OPUS_HAVE_RTCD) extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, int arch); +#define OVERRIDE_CELT_FIR # define celt_fir(x, num, y, N, ord, arch) \ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index f7a014b6..964aef50 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -63,7 +63,7 @@ void xcorr_kernel_sse( #define xcorr_kernel(x, y, sum, len, arch) \ ((void)arch, xcorr_kernel_sse(x, y, sum, len)) -#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse( ((void)arch, celt_inner_prod_sse(x, y, N)) -#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ - (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c index a092c68b..2bc57830 100644 --- a/celt/x86/pitch_sse4_1.c +++ b/celt/x86/pitch_sse4_1.c @@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 __m128i sum0, sum1, sum2, sum3, vecSum; __m128i initSum; +#ifdef OPUS_CHECK_ASM + opus_val32 sum_c[4]; + for (j=0;j<4;j++) { + sum_c[j] = sum[j]; + } + xcorr_kernel_c(x, y, sum_c, len); +#endif + celt_assert(len >= 3); sum0 = _mm_setzero_si128(); @@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 vecSum = _mm_add_epi32(vecSum, sum2); } - for (;j<len;j++) + vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]); + if (len - j == 3) { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX0 = _mm_shuffle_epi32(vecX, 0x55); + vecX1 = _mm_shuffle_epi32(vecX, 0xaa); + vecX2 = _mm_shuffle_epi32(vecX, 0xff); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + vecSum = _mm_add_epi32(vecSum, sum2); + } + else if (len - j == 2) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xaa); + vecX1 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + } + else if (len - j == 1) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); } initSum = _mm_loadu_si128((__m128i *)(&sum[0])); initSum = _mm_add_epi32(initSum, vecSum); _mm_storeu_si128((__m128i *)sum, initSum); + +#ifdef OPUS_CHECK_ASM + celt_assert(!memcmp(sum_c, sum, sizeof(sum_c))); +#endif } #endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c index 080eb25e..6a1914de 100644 --- a/celt/x86/x86cpu.c +++ b/celt/x86/x86cpu.c @@ -35,11 +35,11 @@ #include "pitch.h" #include "x86cpu.h" -#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) - + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) #if defined(_MSC_VER) @@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #else __asm__ __volatile__ ( @@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #endif #elif defined(CPU_INFO_BY_C) - __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); + /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive + prior to v3.17.0.*/ + if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) { + /* Our function cannot fail, but __get_cpuid{_count} can. + Returning all zeroes will effectively disable all SIMD, which is + what we want on CPUs that don't support CPUID. */ + CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0; + } +#else +# error "Configured to use x86 RTCD, but no CPU detection method available. " \ + "Reconfigure with --disable-rtcd (or send patches)." #endif } @@ -98,7 +110,7 @@ typedef struct CPU_Feature{ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) { - unsigned int info[4] = {0}; + unsigned int info[4]; unsigned int nIds = 0; cpuid(info, 0); @@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) } } -int opus_select_arch(void) +static int opus_select_arch_impl(void) { CPU_Feature cpu_feature; int arch; @@ -154,4 +166,13 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + /* Randomly downgrade the architecture. */ + arch = rand()%(arch+1); +#endif + return arch; +} + #endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index 1e2bf17b..04e80489 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -56,40 +56,18 @@ int opus_select_arch(void); # endif -/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32() - or _mm_cvtepi16_epi32() when optimizations are disabled, even though the - actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory - reference, these require 16-byte alignment and load a full 16 bytes (instead - of 4 or 8), possibly reading out of bounds. - - We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or - _mm_loadl_epi64(), which should have the same semantics as an m32 or m64 - reference in the PMOVSXWD instruction itself, but gcc is not smart enough to - optimize this out when optimizations ARE enabled. - - Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 - (which is fair, since technically the compiler is always allowed to do the - dereference before invoking the function implementing the intrinsic). - However, it is smart enough to eliminate the extra MOVD instruction. - For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out - the extra MOVQ if it's specified explicitly */ - -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) -# else -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(*(__m128i *)(x))) -#endif - -/* similar reasoning about the instruction sequence as in the 32-bit macro above, - */ -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI16_EPI32_M64(x) \ +/*MOVD should not impose any alignment restrictions, but the C standard does, + and UBSan will report errors if we actually make unaligned accesses. + Use this to work around those restrictions (which should hopefully all get + optimized to a single MOVD instruction).*/ +#define OP_LOADU_EPI32(x) \ + (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\ + *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U)) + +#define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x)))) + +#define OP_CVTEPI16_EPI32_M64(x) \ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) -# else -# define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(*(__m128i *)(x))) -# endif #endif |