diff options
Diffstat (limited to 'src/xnnpack/intrinsics-polyfill.h')
-rw-r--r-- | src/xnnpack/intrinsics-polyfill.h | 33 |
1 files changed, 27 insertions, 6 deletions
diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h
index 51da04e82..caa192d30 100644
--- a/src/xnnpack/intrinsics-polyfill.h
+++ b/src/xnnpack/intrinsics-polyfill.h
@@ -5,6 +5,27 @@
 
 #pragma once
 
+#include <xnnpack/common.h>
+
+
+#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
+#include <xmmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128 _mm_loadu_ps_notsan(const float* address) {
+  return _mm_loadu_ps(address);
+}
+#endif
+
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#include <emmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128i _mm_loadu_si128_notsan(const __m128i* address) {
+  return _mm_loadu_si128(address);
+}
+#endif
+
 #ifdef __AVX512F__
 #include <immintrin.h>
 
@@ -15,8 +36,8 @@
     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
 
-static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_cvtu32_mask16(unsigned int mask) {
+static XNN_INTRINSIC
+__mmask16 _cvtu32_mask16(unsigned int mask) {
   return (__mmask16) mask;
 }
 
@@ -27,8 +48,8 @@ _cvtu32_mask16(unsigned int mask) {
     (defined(__clang__) && (__clang_major__ < 4)) || \
     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
 
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_add_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_add_ps(__m512 v) {
 #if __AVX512DQ__
   const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
 #else
@@ -40,8 +61,8 @@ _mm512_reduce_add_ps(__m512 v) {
   return _mm_cvtss_f32(sum16);
 }
 
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_max_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_max_ps(__m512 v) {
 #if __AVX512DQ__
   const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
 #else