path: root/src/xnnpack/intrinsics-polyfill.h
author     Marat Dukhan <maratek@google.com>  2020-04-28 10:26:13 -0700
committer  XNNPACK Team <xnnpack-github-robot@google.com>  2020-04-28 10:26:50 -0700
commit     8aaf186f756f4ca7937e0ad24468cb1e17f0da65 (patch)
tree       839033e40db2381fd27c8c2d1c527d988a4bb232 /src/xnnpack/intrinsics-polyfill.h
parent     01849010c146d681fbde8a482f94ef9b213c9e64 (diff)
download   XNNPACK-8aaf186f756f4ca7937e0ad24468cb1e17f0da65.tar.gz
Disable TSan in potentially in-place micro-kernels with OOB reads

SIMD micro-kernels read beyond the end of the input, and when the operation is in-place and the out-of-bounds elements are concurrently written by a different thread, TSan reports a data race. This causes no actual errors, because every computation the out-of-bounds elements contribute to is discarded.

PiperOrigin-RevId: 308848666
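For context, here is roughly how a micro-kernel would use the TSan-suppressed load added below (a minimal sketch, not code from this commit; the kernel name and shape are hypothetical, and it assumes the caller guarantees that reads up to 3 floats past input + n stay inside allocated memory, which XNNPACK arranges via buffer padding):

    #include <stddef.h>
    #include <xmmintrin.h>
    #include <xnnpack/intrinsics-polyfill.h>

    // Hypothetical in-place-capable kernel: scales n floats by c, 4 lanes
    // per iteration. The last iteration may load past input + n; the extra
    // lanes are computed but never stored, so the OOB read is benign.
    static void scale_f32(const float* input, float* output, size_t n, float c) {
      const __m128 vc = _mm_set1_ps(c);
      for (size_t i = 0; i < n; i += 4) {
        // Same as _mm_loadu_ps, but invisible to TSan: when input == output,
        // another thread may be writing the lanes beyond n concurrently.
        const __m128 vx = _mm_loadu_ps_notsan(input + i);
        const __m128 vy = _mm_mul_ps(vx, vc);
        if (i + 4 <= n) {
          _mm_storeu_ps(output + i, vy);
        } else {
          float tmp[4];  // spill, then store only the n - i valid lanes
          _mm_storeu_ps(tmp, vy);
          for (size_t j = 0; i + j < n; j++) output[i + j] = tmp[j];
        }
      }
    }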
Diffstat (limited to 'src/xnnpack/intrinsics-polyfill.h')
-rw-r--r--  src/xnnpack/intrinsics-polyfill.h  33
1 file changed, 27 insertions(+), 6 deletions(-)
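The new code leans on two macros from the freshly included <xnnpack/common.h>. Their definitions are not part of this diff; a plausible rendering, judging from the __attribute__ list the macro replaces below and from how Clang exposes sanitizer suppression (both are assumptions, not verbatim XNNPACK code):

    /* Assumed shape of the helpers in src/xnnpack/common.h; the real file
       also covers MSVC and other toolchains. */
    #if defined(__GNUC__)
      #define XNN_INTRINSIC inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    #else
      #define XNN_INTRINSIC inline
    #endif

    /* Suppress ThreadSanitizer instrumentation for one function (Clang). */
    #if defined(__has_feature)
      #if __has_feature(thread_sanitizer)
        #define XNN_DISABLE_TSAN __attribute__((no_sanitize("thread")))
      #endif
    #endif
    #ifndef XNN_DISABLE_TSAN
      #define XNN_DISABLE_TSAN
    #endif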
diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h
index 51da04e82..caa192d30 100644
--- a/src/xnnpack/intrinsics-polyfill.h
+++ b/src/xnnpack/intrinsics-polyfill.h
@@ -5,6 +5,27 @@
#pragma once
+#include <xnnpack/common.h>
+
+
+#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
+#include <xmmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128 _mm_loadu_ps_notsan(const float* address) {
+ return _mm_loadu_ps(address);
+}
+#endif
+
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#include <emmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128i _mm_loadu_si128_notsan(const __m128i* address) {
+ return _mm_loadu_si128(address);
+}
+#endif
+
#ifdef __AVX512F__
#include <immintrin.h>
@@ -15,8 +36,8 @@
(defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
(defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
-static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_cvtu32_mask16(unsigned int mask) {
+static XNN_INTRINSIC
+__mmask16 _cvtu32_mask16(unsigned int mask) {
return (__mmask16) mask;
}
@@ -27,8 +48,8 @@ _cvtu32_mask16(unsigned int mask) {
(defined(__clang__) && (__clang_major__ < 4)) || \
(defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_add_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_add_ps(__m512 v) {
#if __AVX512DQ__
const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
#else
@@ -40,8 +61,8 @@ _mm512_reduce_add_ps(__m512 v) {
return _mm_cvtss_f32(sum16);
}
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_max_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_max_ps(__m512 v) {
#if __AVX512DQ__
const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
#else
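The excerpt cuts off inside _mm512_reduce_max_ps. By analogy with the _mm512_reduce_add_ps body above, which narrows 512 -> 256 -> 128 bits and ends in _mm_cvtss_f32(sum16), the rest of the polyfill plausibly performs the same pairwise narrowing with maxima instead of sums. A hedged reconstruction of the remainder (the variable names and the non-AVX512DQ extract are assumptions, not verbatim file contents):

      /* Sketch only. Without AVX512DQ there is no _mm512_extractf32x8_ps,
         so the upper 256 bits are extracted through a double-precision cast. */
      const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v),
          _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
    #endif
      const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
      const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4));
      const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8));
      return _mm_cvtss_f32(sum16);
    }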