aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/f32-hswish/gen/sse-x4.c3
-rw-r--r--src/f32-hswish/gen/sse-x8.c3
-rw-r--r--src/f32-hswish/sse.c.in3
-rw-r--r--src/f32-maxpool/9p8x-minmax-sse-c4.c39
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c3
-rw-r--r--src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c3
-rw-r--r--src/f32-raddstoreexpminusmax/sse2-p5.c.in3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x12.c3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x16.c3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x20.c3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x24.c3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x4.c3
-rw-r--r--src/f32-sigmoid/gen/sse2-p5-div-x8.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x12.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x16.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x20.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x24.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x4.c3
-rw-r--r--src/f32-sigmoid/gen/sse41-p5-div-x8.c3
-rw-r--r--src/f32-sigmoid/sse-p5-div.c.in3
-rw-r--r--src/f32-vbinary/gen/vadd-minmax-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vadd-minmax-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vaddc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vaddc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vdiv-minmax-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vdiv-minmax-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vdivc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vdivc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vmax-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vmax-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vmaxc-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vmaxc-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vmin-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vmin-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vminc-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vminc-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vmul-minmax-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vmul-minmax-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vmulc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vmulc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/gen/vsub-minmax-sse-x4.c5
-rw-r--r--src/f32-vbinary/gen/vsub-minmax-sse-x8.c5
-rw-r--r--src/f32-vbinary/gen/vsubc-minmax-sse-x4.c3
-rw-r--r--src/f32-vbinary/gen/vsubc-minmax-sse-x8.c3
-rw-r--r--src/f32-vbinary/vop-sse.c.in5
-rw-r--r--src/f32-vbinary/vopc-sse.c.in3
-rw-r--r--src/u8-maxpool/9p8x-minmax-sse2-c16.c39
-rw-r--r--src/xnnpack/common.h8
-rw-r--r--src/xnnpack/intrinsics-polyfill.h33
63 files changed, 206 insertions, 116 deletions
diff --git a/src/f32-hswish/gen/sse-x4.c b/src/f32-hswish/gen/sse-x4.c
index a09107e9e..f1b8c6c96 100644
--- a/src/f32-hswish/gen/sse-x4.c
+++ b/src/f32-hswish/gen/sse-x4.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_hswish_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx0123 = _mm_loadu_ps(x);
+ const __m128 vx0123 = _mm_loadu_ps_notsan(x);
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
diff --git a/src/f32-hswish/gen/sse-x8.c b/src/f32-hswish/gen/sse-x8.c
index 87e2d50b5..f4639a069 100644
--- a/src/f32-hswish/gen/sse-x8.c
+++ b/src/f32-hswish/gen/sse-x8.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -65,7 +66,7 @@ void xnn_f32_hswish_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx0123 = _mm_loadu_ps(x);
+ const __m128 vx0123 = _mm_loadu_ps_notsan(x);
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
diff --git a/src/f32-hswish/sse.c.in b/src/f32-hswish/sse.c.in
index 88c057cc8..40ea6813c 100644
--- a/src/f32-hswish/sse.c.in
+++ b/src/f32-hswish/sse.c.in
@@ -11,6 +11,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -67,7 +68,7 @@ void xnn_f32_hswish_ukernel__sse_x${BATCH_TILE}(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx0123 = _mm_loadu_ps(x);
+ const __m128 vx0123 = _mm_loadu_ps_notsan(x);
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
diff --git a/src/f32-maxpool/9p8x-minmax-sse-c4.c b/src/f32-maxpool/9p8x-minmax-sse-c4.c
index 3606b76e6..81abdab32 100644
--- a/src/f32-maxpool/9p8x-minmax-sse-c4.c
+++ b/src/f32-maxpool/9p8x-minmax-sse-c4.c
@@ -7,10 +7,11 @@
#include <xmmintrin.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/maxpool.h>
-XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
+void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
@@ -108,23 +109,23 @@ XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
o += 4;
}
if (c != 0) {
- const __m128 vi0 = _mm_loadu_ps(i0);
+ const __m128 vi0 = _mm_loadu_ps_notsan(i0);
i0 += 4;
- const __m128 vi1 = _mm_loadu_ps(i1);
+ const __m128 vi1 = _mm_loadu_ps_notsan(i1);
i1 += 4;
- const __m128 vi2 = _mm_loadu_ps(i2);
+ const __m128 vi2 = _mm_loadu_ps_notsan(i2);
i2 += 4;
- const __m128 vi3 = _mm_loadu_ps(i3);
+ const __m128 vi3 = _mm_loadu_ps_notsan(i3);
i3 += 4;
- const __m128 vi4 = _mm_loadu_ps(i4);
+ const __m128 vi4 = _mm_loadu_ps_notsan(i4);
i4 += 4;
- const __m128 vi5 = _mm_loadu_ps(i5);
+ const __m128 vi5 = _mm_loadu_ps_notsan(i5);
i5 += 4;
- const __m128 vi6 = _mm_loadu_ps(i6);
+ const __m128 vi6 = _mm_loadu_ps_notsan(i6);
i6 += 4;
- const __m128 vi7 = _mm_loadu_ps(i7);
+ const __m128 vi7 = _mm_loadu_ps_notsan(i7);
i7 += 4;
- const __m128 vi8 = _mm_loadu_ps(i8);
+ const __m128 vi8 = _mm_loadu_ps_notsan(i8);
i8 += 4;
const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
@@ -223,15 +224,15 @@ XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
o += 4;
}
if (c != 0) {
- const __m128 vi0 = _mm_loadu_ps(i0);
- const __m128 vi1 = _mm_loadu_ps(i1);
- const __m128 vi2 = _mm_loadu_ps(i2);
- const __m128 vi3 = _mm_loadu_ps(i3);
- const __m128 vi4 = _mm_loadu_ps(i4);
- const __m128 vi5 = _mm_loadu_ps(i5);
- const __m128 vi6 = _mm_loadu_ps(i6);
- const __m128 vi7 = _mm_loadu_ps(i7);
- const __m128 vo = _mm_loadu_ps(o);
+ const __m128 vi0 = _mm_loadu_ps_notsan(i0);
+ const __m128 vi1 = _mm_loadu_ps_notsan(i1);
+ const __m128 vi2 = _mm_loadu_ps_notsan(i2);
+ const __m128 vi3 = _mm_loadu_ps_notsan(i3);
+ const __m128 vi4 = _mm_loadu_ps_notsan(i4);
+ const __m128 vi5 = _mm_loadu_ps_notsan(i5);
+ const __m128 vi6 = _mm_loadu_ps_notsan(i6);
+ const __m128 vi7 = _mm_loadu_ps_notsan(i7);
+ const __m128 vo = _mm_loadu_ps_notsan(o);
const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
const __m128 vmax23 = _mm_max_ps(vi2, vi3);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
index ae36a7616..5ae3399a0 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -181,7 +182,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
index 9761b8fe9..785b0c760 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -183,7 +184,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
index 9f8cf6979..40c6ce9b3 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -178,7 +179,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
index 238022710..cfb462bb7 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -197,7 +198,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
index 1e60b6534..346dea131 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -201,7 +202,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
index 09f542fd7..53cd00e17 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -194,7 +195,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
index f92b7919d..4e9bf2be4 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -213,7 +214,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
index d58661b86..7c7445e59 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -219,7 +220,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
index 3ab5db388..ca0fde9af 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -210,7 +211,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
index 157b57689..172fdd392 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -146,7 +147,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
index d3ef0b3aa..05d2aa88b 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -165,7 +166,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
index 0460e422b..751020fbe 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -162,7 +163,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8(
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/sse2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
index 51a33c888..6582df80a 100644
--- a/src/f32-raddstoreexpminusmax/sse2-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
@@ -12,6 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/raddstoreexpminusmax.h>
@@ -173,7 +174,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x${ELEMENTS_TILE}${"" if ACCU
assert(elements >= 1 * sizeof(float));
assert(elements <= 3 * sizeof(float));
// Load 4 inputs at a time.
- const __m128 vi = _mm_loadu_ps(input);
+ const __m128 vi = _mm_loadu_ps_notsan(input);
// Subtract maximum input x := i - i_max. This implies x <= 0.
const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x12.c b/src/f32-sigmoid/gen/sse2-p5-div-x12.c
index da8f0a288..388281d39 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x12.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x12.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -213,7 +214,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x12(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x16.c b/src/f32-sigmoid/gen/sse2-p5-div-x16.c
index 65e43cda4..f96d98bd8 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x16.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x16.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -232,7 +233,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x16(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x20.c b/src/f32-sigmoid/gen/sse2-p5-div-x20.c
index 916fff048..d0de04a12 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x20.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x20.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -251,7 +252,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x20(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x24.c b/src/f32-sigmoid/gen/sse2-p5-div-x24.c
index 920204f93..72363b4ba 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x24.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x24.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -270,7 +271,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x24(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x4.c b/src/f32-sigmoid/gen/sse2-p5-div-x4.c
index f543ef817..89dd0a99a 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x4.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x4.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -105,7 +106,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x8.c b/src/f32-sigmoid/gen/sse2-p5-div-x8.c
index 21f6fde0f..43c29516f 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x8.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x8.c
@@ -12,6 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -194,7 +195,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x12.c b/src/f32-sigmoid/gen/sse41-p5-div-x12.c
index 7753dd313..ef61c6aea 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x12.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x12.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -208,7 +209,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x12(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x16.c b/src/f32-sigmoid/gen/sse41-p5-div-x16.c
index 92f989a23..a8e38ea5f 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x16.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x16.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -226,7 +227,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x16(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x20.c b/src/f32-sigmoid/gen/sse41-p5-div-x20.c
index a58a9cd65..f42ac8693 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x20.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x20.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -244,7 +245,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x20(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x24.c b/src/f32-sigmoid/gen/sse41-p5-div-x24.c
index 71979d3a5..72b8fd7b3 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x24.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x24.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -262,7 +263,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x24(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x4.c b/src/f32-sigmoid/gen/sse41-p5-div-x4.c
index 54b600a7e..27679fb2d 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x4.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x4.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -104,7 +105,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x8.c b/src/f32-sigmoid/gen/sse41-p5-div-x8.c
index 6a9e9db78..5c28a64d6 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x8.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x8.c
@@ -12,6 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -190,7 +191,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-sigmoid/sse-p5-div.c.in b/src/f32-sigmoid/sse-p5-div.c.in
index c509e8d3d..05a0607cc 100644
--- a/src/f32-sigmoid/sse-p5-div.c.in
+++ b/src/f32-sigmoid/sse-p5-div.c.in
@@ -14,6 +14,7 @@ $else:
#include <emmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vunary.h>
@@ -206,7 +207,7 @@ void xnn_f32_sigmoid_ukernel__${"sse41" if BLEND else "sse2"}_p5_div_x${BATCH_TI
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 vx = _mm_loadu_ps(x);
+ const __m128 vx = _mm_loadu_ps_notsan(x);
// General structure of the algorithm:
// / exp(x) / (1 + exp(x)) if x <= 0
diff --git a/src/f32-vbinary/gen/vadd-minmax-sse-x4.c b/src/f32-vbinary/gen/vadd-minmax-sse-x4.c
index 0d940c30e..4eed934aa 100644
--- a/src/f32-vbinary/gen/vadd-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vadd-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,8 +59,8 @@ void xnn_f32_vadd_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_add_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vadd-minmax-sse-x8.c b/src/f32-vbinary/gen/vadd-minmax-sse-x8.c
index 529e6a5d7..48b29a939 100644
--- a/src/f32-vbinary/gen/vadd-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vadd-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -64,8 +65,8 @@ void xnn_f32_vadd_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_add_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c b/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c
index c9d9ae52d..48ef5fecf 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vaddc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_add_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c b/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c
index 21853a63f..3a1b371cf 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vaddc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_add_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c b/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c
index 3fda9b4b7..1944f4417 100644
--- a/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,8 +59,8 @@ void xnn_f32_vdiv_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_div_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c b/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c
index 3f75e7fd9..0d7da93f6 100644
--- a/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -64,8 +65,8 @@ void xnn_f32_vdiv_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_div_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c b/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c
index 2c19a848d..2b9042788 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vdivc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_div_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c b/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c
index c2b24059b..6b9191d6a 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vdivc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_div_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vmax-sse-x4.c b/src/f32-vbinary/gen/vmax-sse-x4.c
index 862013901..6e04ea491 100644
--- a/src/f32-vbinary/gen/vmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -51,8 +52,8 @@ void xnn_f32_vmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_max_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmax-sse-x8.c b/src/f32-vbinary/gen/vmax-sse-x8.c
index 7a506af9c..20ceef219 100644
--- a/src/f32-vbinary/gen/vmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -55,8 +56,8 @@ void xnn_f32_vmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_max_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmaxc-sse-x4.c b/src/f32-vbinary/gen/vmaxc-sse-x4.c
index 2702effe4..235fce19e 100644
--- a/src/f32-vbinary/gen/vmaxc-sse-x4.c
+++ b/src/f32-vbinary/gen/vmaxc-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -46,7 +47,7 @@ void xnn_f32_vmaxc_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_max_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmaxc-sse-x8.c b/src/f32-vbinary/gen/vmaxc-sse-x8.c
index 642bfe669..30bbf23bb 100644
--- a/src/f32-vbinary/gen/vmaxc-sse-x8.c
+++ b/src/f32-vbinary/gen/vmaxc-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -49,7 +50,7 @@ void xnn_f32_vmaxc_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_max_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmin-sse-x4.c b/src/f32-vbinary/gen/vmin-sse-x4.c
index c9ca659ef..7875e8869 100644
--- a/src/f32-vbinary/gen/vmin-sse-x4.c
+++ b/src/f32-vbinary/gen/vmin-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -51,8 +52,8 @@ void xnn_f32_vmin_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_min_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmin-sse-x8.c b/src/f32-vbinary/gen/vmin-sse-x8.c
index 92770a692..28b51877e 100644
--- a/src/f32-vbinary/gen/vmin-sse-x8.c
+++ b/src/f32-vbinary/gen/vmin-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -55,8 +56,8 @@ void xnn_f32_vmin_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_min_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vminc-sse-x4.c b/src/f32-vbinary/gen/vminc-sse-x4.c
index 4fac969d2..cea189135 100644
--- a/src/f32-vbinary/gen/vminc-sse-x4.c
+++ b/src/f32-vbinary/gen/vminc-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -46,7 +47,7 @@ void xnn_f32_vminc_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_min_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vminc-sse-x8.c b/src/f32-vbinary/gen/vminc-sse-x8.c
index e3c579925..5b3e65646 100644
--- a/src/f32-vbinary/gen/vminc-sse-x8.c
+++ b/src/f32-vbinary/gen/vminc-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -49,7 +50,7 @@ void xnn_f32_vminc_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_min_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vbinary/gen/vmul-minmax-sse-x4.c b/src/f32-vbinary/gen/vmul-minmax-sse-x4.c
index e50094109..a1fc3057f 100644
--- a/src/f32-vbinary/gen/vmul-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vmul-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,8 +59,8 @@ void xnn_f32_vmul_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_mul_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vmul-minmax-sse-x8.c b/src/f32-vbinary/gen/vmul-minmax-sse-x8.c
index 7c3186708..2678c4b25 100644
--- a/src/f32-vbinary/gen/vmul-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vmul-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -64,8 +65,8 @@ void xnn_f32_vmul_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_mul_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c b/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c
index 19c9cb1af..e01cf39b6 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vmulc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_mul_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c b/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c
index a1fb4a733..6a0d03367 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vmulc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_mul_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c b/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c
index 51f0811c5..d18b9d1ed 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_div_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c
index 48eb7411b..01ac18c1e 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_div_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c b/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c
index a9c0df140..0523b6900 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_sub_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c
index 7be1c4681..e03085dc3 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_sub_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vsub-minmax-sse-x4.c b/src/f32-vbinary/gen/vsub-minmax-sse-x4.c
index 37268fced..e3ca419dc 100644
--- a/src/f32-vbinary/gen/vsub-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vsub-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,8 +59,8 @@ void xnn_f32_vsub_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vsub-minmax-sse-x8.c b/src/f32-vbinary/gen/vsub-minmax-sse-x8.c
index ff1866560..33568d0ba 100644
--- a/src/f32-vbinary/gen/vsub-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vsub-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -64,8 +65,8 @@ void xnn_f32_vsub_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c b/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c
index b404ad45b..fde8b8efd 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -53,7 +54,7 @@ void xnn_f32_vsubc_minmax_ukernel__sse_x4(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c b/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c
index 4db9df9c0..11cc16d43 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c
@@ -12,6 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -58,7 +59,7 @@ void xnn_f32_vsubc_minmax_ukernel__sse_x8(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
diff --git a/src/f32-vbinary/vop-sse.c.in b/src/f32-vbinary/vop-sse.c.in
index e0425d292..fd096ea1a 100644
--- a/src/f32-vbinary/vop-sse.c.in
+++ b/src/f32-vbinary/vop-sse.c.in
@@ -13,6 +13,7 @@ $assert ACTIVATION in ["LINEAR", "MINMAX"]
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -82,8 +83,8 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__sse_x${BATCH_TILE}(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
- const __m128 vb0123 = _mm_loadu_ps(b);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
+ const __m128 vb0123 = _mm_loadu_ps_notsan(b);
__m128 vy0123 = ${_MM_OP_PS("va0123", "vb0123")};
$if ACTIVATION == "MINMAX":
diff --git a/src/f32-vbinary/vopc-sse.c.in b/src/f32-vbinary/vopc-sse.c.in
index 75fb3fe16..0f27ca3d9 100644
--- a/src/f32-vbinary/vopc-sse.c.in
+++ b/src/f32-vbinary/vopc-sse.c.in
@@ -13,6 +13,7 @@ $assert ACTIVATION in ["LINEAR", "MINMAX"]
#include <xmmintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/vbinary.h>
@@ -77,7 +78,7 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__sse_x${BATCH_TILE}(
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const __m128 va0123 = _mm_loadu_ps(a);
+ const __m128 va0123 = _mm_loadu_ps_notsan(a);
__m128 vy0123 = ${_MM_OP_PS("va0123")};
$if ACTIVATION == "MINMAX":
diff --git a/src/u8-maxpool/9p8x-minmax-sse2-c16.c b/src/u8-maxpool/9p8x-minmax-sse2-c16.c
index 1997fe73d..ba79f6986 100644
--- a/src/u8-maxpool/9p8x-minmax-sse2-c16.c
+++ b/src/u8-maxpool/9p8x-minmax-sse2-c16.c
@@ -10,10 +10,11 @@
#include <emmintrin.h>
+#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/maxpool.h>
-XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16(
+void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
@@ -102,15 +103,15 @@ XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16(
_mm_storeu_si128((__m128i*) o, vout); o += 16;
}
if (c != 0) {
- const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
- const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
- const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
- const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
- const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
- const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
- const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
- const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
- const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8);
+ const __m128i vi0 = _mm_loadu_si128_notsan((const __m128i*) i0);
+ const __m128i vi1 = _mm_loadu_si128_notsan((const __m128i*) i1);
+ const __m128i vi2 = _mm_loadu_si128_notsan((const __m128i*) i2);
+ const __m128i vi3 = _mm_loadu_si128_notsan((const __m128i*) i3);
+ const __m128i vi4 = _mm_loadu_si128_notsan((const __m128i*) i4);
+ const __m128i vi5 = _mm_loadu_si128_notsan((const __m128i*) i5);
+ const __m128i vi6 = _mm_loadu_si128_notsan((const __m128i*) i6);
+ const __m128i vi7 = _mm_loadu_si128_notsan((const __m128i*) i7);
+ const __m128i vi8 = _mm_loadu_si128_notsan((const __m128i*) i8);
const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8);
const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
@@ -210,15 +211,15 @@ XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16(
o += 16;
}
if (c != 0) {
- const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
- const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
- const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
- const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
- const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
- const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
- const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
- const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
- const __m128i vo = _mm_loadu_si128((const __m128i*) o);
+ const __m128i vi0 = _mm_loadu_si128_notsan((const __m128i*) i0);
+ const __m128i vi1 = _mm_loadu_si128_notsan((const __m128i*) i1);
+ const __m128i vi2 = _mm_loadu_si128_notsan((const __m128i*) i2);
+ const __m128i vi3 = _mm_loadu_si128_notsan((const __m128i*) i3);
+ const __m128i vi4 = _mm_loadu_si128_notsan((const __m128i*) i4);
+ const __m128i vi5 = _mm_loadu_si128_notsan((const __m128i*) i5);
+ const __m128i vi6 = _mm_loadu_si128_notsan((const __m128i*) i6);
+ const __m128i vi7 = _mm_loadu_si128_notsan((const __m128i*) i7);
+ const __m128i vo = _mm_loadu_si128_notsan((const __m128i*) o);
const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo);
const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
index 86e9b3ac7..4063a5e14 100644
--- a/src/xnnpack/common.h
+++ b/src/xnnpack/common.h
@@ -188,6 +188,14 @@
#endif
#if defined(__GNUC__)
+ #define XNN_INTRINSIC inline __attribute__((__always_inline__, __artificial__))
+#elif defined(_MSC_VER)
+ #define XNN_INTRINSIC __forceinline
+#else
+ #define XNN_INTRINSIC inline
+#endif
+
+#if defined(__GNUC__)
#define XNN_INLINE inline __attribute__((__always_inline__))
#elif defined(_MSC_VER)
#define XNN_INLINE __forceinline
diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h
index 51da04e82..caa192d30 100644
--- a/src/xnnpack/intrinsics-polyfill.h
+++ b/src/xnnpack/intrinsics-polyfill.h
@@ -5,6 +5,27 @@
#pragma once
+#include <xnnpack/common.h>
+
+
+#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
+#include <xmmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128 _mm_loadu_ps_notsan(const float* address) {
+ return _mm_loadu_ps(address);
+}
+#endif
+
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#include <emmintrin.h>
+
+static XNN_INTRINSIC XNN_DISABLE_TSAN
+__m128i _mm_loadu_si128_notsan(const __m128i* address) {
+ return _mm_loadu_si128(address);
+}
+#endif
+
#ifdef __AVX512F__
#include <immintrin.h>
@@ -15,8 +36,8 @@
(defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
(defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
-static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_cvtu32_mask16(unsigned int mask) {
+static XNN_INTRINSIC
+__mmask16 _cvtu32_mask16(unsigned int mask) {
return (__mmask16) mask;
}
@@ -27,8 +48,8 @@ _cvtu32_mask16(unsigned int mask) {
(defined(__clang__) && (__clang_major__ < 4)) || \
(defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_add_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_add_ps(__m512 v) {
#if __AVX512DQ__
const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
#else
@@ -40,8 +61,8 @@ _mm512_reduce_add_ps(__m512 v) {
return _mm_cvtss_f32(sum16);
}
-static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_reduce_max_ps(__m512 v) {
+static XNN_INTRINSIC
+float _mm512_reduce_max_ps(__m512 v) {
#if __AVX512DQ__
const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
#else