diff options
author | Marat Dukhan <maratek@google.com> | 2020-04-28 10:26:13 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2020-04-28 10:26:50 -0700 |
commit | 8aaf186f756f4ca7937e0ad24468cb1e17f0da65 (patch) | |
tree | 839033e40db2381fd27c8c2d1c527d988a4bb232 /src/xnnpack/intrinsics-polyfill.h | |
parent | 01849010c146d681fbde8a482f94ef9b213c9e64 (diff) | |
download | XNNPACK-8aaf186f756f4ca7937e0ad24468cb1e17f0da65.tar.gz |
Disable TSan in potentially in-place micro-kernels with OOB reads
SIMD micro-kernels read beyond the end of input, and when the operation is in-place and the out-of-bounds elements were written by a different thread, TSan reports a data race. This doesn't cause any errors, though, because all computations
to which the out-of-bounds elements contribute are discarded.
PiperOrigin-RevId: 308848666
Diffstat (limited to 'src/xnnpack/intrinsics-polyfill.h')
-rw-r--r-- | src/xnnpack/intrinsics-polyfill.h | 33 |
1 file changed, 27 insertions, 6 deletions
diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h index 51da04e82..caa192d30 100644 --- a/src/xnnpack/intrinsics-polyfill.h +++ b/src/xnnpack/intrinsics-polyfill.h @@ -5,6 +5,27 @@ #pragma once +#include <xnnpack/common.h> + + +#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) +#include <xmmintrin.h> + +static XNN_INTRINSIC XNN_DISABLE_TSAN +__m128 _mm_loadu_ps_notsan(const float* address) { + return _mm_loadu_ps(address); +} +#endif + +#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) +#include <emmintrin.h> + +static XNN_INTRINSIC XNN_DISABLE_TSAN +__m128i _mm_loadu_si128_notsan(const __m128i* address) { + return _mm_loadu_si128(address); +} +#endif + #ifdef __AVX512F__ #include <immintrin.h> @@ -15,8 +36,8 @@ (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) -static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask16(unsigned int mask) { +static XNN_INTRINSIC +__mmask16 _cvtu32_mask16(unsigned int mask) { return (__mmask16) mask; } @@ -27,8 +48,8 @@ _cvtu32_mask16(unsigned int mask) { (defined(__clang__) && (__clang_major__ < 4)) || \ (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) -static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_ps(__m512 v) { +static XNN_INTRINSIC +float _mm512_reduce_add_ps(__m512 v) { #if __AVX512DQ__ const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); #else @@ -40,8 +61,8 @@ _mm512_reduce_add_ps(__m512 v) { return _mm_cvtss_f32(sum16); } -static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_ps(__m512 v) { +static XNN_INTRINSIC +float _mm512_reduce_max_ps(__m512 v) { #if __AVX512DQ__ const __m256 sum2 = 
_mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); #else |