aboutsummaryrefslogtreecommitdiff
path: root/celt/x86
diff options
context:
space:
mode:
authorHarish Mahendrakar <harish.mahendrakar@ittiam.com>2022-08-08 18:29:15 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2022-08-08 18:29:15 +0000
commitf7ab9bbfd700b50012e041cc2a823c7ac681482e (patch)
tree4bd714368888e70bad2e0709002e0edffa3334ee /celt/x86
parent975ff1d0441ac6688eec6d0281c621603788573b (diff)
parent0c5459731645f6137cf70a2ff47887d07f89e63f (diff)
downloadlibopus-f7ab9bbfd700b50012e041cc2a823c7ac681482e.tar.gz
Merge commit 'c9d5bea13e3cb7381bfa897a45d8bab4e7b767a7' into HEAD am: 170fd0b529 am: 0a0c005652 am: 8e70d76446 am: 0c54597316
Original change: https://android-review.googlesource.com/c/platform/external/libopus/+/2164282 Change-Id: I567bb020c60e7bb36177c155e44cd5bc1891a3b2 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
Diffstat (limited to 'celt/x86')
-rw-r--r--celt/x86/celt_lpc_sse.h5
-rw-r--r--celt/x86/pitch_sse.h6
-rw-r--r--celt/x86/pitch_sse4_1.c51
-rw-r--r--celt/x86/x86cpu.c37
-rw-r--r--celt/x86/x86cpu.h46
5 files changed, 95 insertions, 50 deletions
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
void celt_fir_sse4_1(
const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
int arch);
#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
#define celt_fir(x, num, y, N, ord, arch) \
((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
-#else
+#elif defined(OPUS_HAVE_RTCD)
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
int ord,
int arch);
+#define OVERRIDE_CELT_FIR
# define celt_fir(x, num, y, N, ord, arch) \
((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b6..964aef50 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
#define xcorr_kernel(x, y, sum, len, arch) \
((void)arch, xcorr_kernel_sse(x, y, sum, len))
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
((void)arch, celt_inner_prod_sse(x, y, N))
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
- (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..2bc57830 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
__m128i sum0, sum1, sum2, sum3, vecSum;
__m128i initSum;
+#ifdef OPUS_CHECK_ASM
+ opus_val32 sum_c[4];
+ for (j=0;j<4;j++) {
+ sum_c[j] = sum[j];
+ }
+ xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
celt_assert(len >= 3);
sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
vecSum = _mm_add_epi32(vecSum, sum2);
}
- for (;j<len;j++)
+ vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+ if (len - j == 3)
{
- vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
- vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xff);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+ else if (len - j == 2)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ }
+ else if (len - j == 1)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
vecSum = _mm_add_epi32(vecSum, sum0);
}
initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
initSum = _mm_add_epi32(initSum, vecSum);
_mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+ celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
}
#endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25e..6a1914de 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
#include "pitch.h"
#include "x86cpu.h"
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
#if defined(_MSC_VER)
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=r" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#else
__asm__ __volatile__ (
@@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#endif
#elif defined(CPU_INFO_BY_C)
- __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+ /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+ prior to v3.17.0.*/
+ if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+ /* Our function cannot fail, but __get_cpuid{_count} can.
+ Returning all zeroes will effectively disable all SIMD, which is
+ what we want on CPUs that don't support CPUID. */
+ CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+ }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
#endif
}
@@ -98,7 +110,7 @@ typedef struct CPU_Feature{
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
{
- unsigned int info[4] = {0};
+ unsigned int info[4];
unsigned int nIds = 0;
cpuid(info, 0);
@@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
}
}
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
{
CPU_Feature cpu_feature;
int arch;
@@ -154,4 +166,13 @@ int opus_select_arch(void)
return arch;
}
+int opus_select_arch(void) {
+ int arch = opus_select_arch_impl();
+#ifdef FUZZING
+ /* Randomly downgrade the architecture. */
+ arch = rand()%(arch+1);
+#endif
+ return arch;
+}
+
#endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17b..04e80489 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,18 @@
int opus_select_arch(void);
# endif
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
- or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
- actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
- reference, these require 16-byte alignment and load a full 16 bytes (instead
- of 4 or 8), possibly reading out of bounds.
-
- We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
- _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
- reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
- optimize this out when optimizations ARE enabled.
-
- Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
- (which is fair, since technically the compiler is always allowed to do the
- dereference before invoking the function implementing the intrinsic).
- However, it is smart enough to eliminate the extra MOVD instruction.
- For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
- the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
-
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI16_EPI32_M64(x) \
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+ and UBSan will report errors if we actually make unaligned accesses.
+ Use this to work around those restrictions (which should hopefully all get
+ optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+ (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+ *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+
+#define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+
+#define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-# define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
#endif