diff options
Diffstat (limited to 'src')
63 files changed, 206 insertions, 116 deletions
diff --git a/src/f32-hswish/gen/sse-x4.c b/src/f32-hswish/gen/sse-x4.c index a09107e9e..f1b8c6c96 100644 --- a/src/f32-hswish/gen/sse-x4.c +++ b/src/f32-hswish/gen/sse-x4.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_hswish_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx0123 = _mm_loadu_ps(x); + const __m128 vx0123 = _mm_loadu_ps_notsan(x); __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); vacc0123 = _mm_add_ps(vacc0123, vhalf); vacc0123 = _mm_max_ps(vacc0123, vzero); diff --git a/src/f32-hswish/gen/sse-x8.c b/src/f32-hswish/gen/sse-x8.c index 87e2d50b5..f4639a069 100644 --- a/src/f32-hswish/gen/sse-x8.c +++ b/src/f32-hswish/gen/sse-x8.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -65,7 +66,7 @@ void xnn_f32_hswish_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx0123 = _mm_loadu_ps(x); + const __m128 vx0123 = _mm_loadu_ps_notsan(x); __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); vacc0123 = _mm_add_ps(vacc0123, vhalf); vacc0123 = _mm_max_ps(vacc0123, vzero); diff --git a/src/f32-hswish/sse.c.in b/src/f32-hswish/sse.c.in index 88c057cc8..40ea6813c 100644 --- a/src/f32-hswish/sse.c.in +++ b/src/f32-hswish/sse.c.in @@ -11,6 +11,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -67,7 +68,7 @@ void xnn_f32_hswish_ukernel__sse_x${BATCH_TILE}( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx0123 = _mm_loadu_ps(x); + const __m128 vx0123 = _mm_loadu_ps_notsan(x); __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); vacc0123 = _mm_add_ps(vacc0123, vhalf); vacc0123 = _mm_max_ps(vacc0123, vzero); diff --git a/src/f32-maxpool/9p8x-minmax-sse-c4.c b/src/f32-maxpool/9p8x-minmax-sse-c4.c index 3606b76e6..81abdab32 100644 --- a/src/f32-maxpool/9p8x-minmax-sse-c4.c +++ b/src/f32-maxpool/9p8x-minmax-sse-c4.c @@ -7,10 +7,11 @@ #include <xmmintrin.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/maxpool.h> -XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4( +void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4( size_t output_pixels, size_t kernel_elements, size_t channels, @@ -108,23 +109,23 @@ XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4( o += 4; } if (c != 0) { - const __m128 vi0 = _mm_loadu_ps(i0); + const __m128 vi0 = _mm_loadu_ps_notsan(i0); i0 += 4; - const __m128 vi1 = _mm_loadu_ps(i1); + const __m128 vi1 = _mm_loadu_ps_notsan(i1); i1 += 4; - const __m128 vi2 = _mm_loadu_ps(i2); + const __m128 vi2 = _mm_loadu_ps_notsan(i2); i2 += 4; - const __m128 vi3 = _mm_loadu_ps(i3); + const __m128 vi3 = _mm_loadu_ps_notsan(i3); i3 += 4; - const __m128 vi4 = _mm_loadu_ps(i4); + const __m128 vi4 = _mm_loadu_ps_notsan(i4); i4 += 4; - const __m128 vi5 = _mm_loadu_ps(i5); + const __m128 vi5 = _mm_loadu_ps_notsan(i5); i5 += 4; - const __m128 vi6 = _mm_loadu_ps(i6); + const __m128 vi6 = _mm_loadu_ps_notsan(i6); i6 += 4; - const __m128 vi7 = _mm_loadu_ps(i7); + const __m128 vi7 = _mm_loadu_ps_notsan(i7); i7 += 4; - const __m128 vi8 = _mm_loadu_ps(i8); + const __m128 vi8 = _mm_loadu_ps_notsan(i8); i8 += 4; const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8); @@ -223,15 +224,15 @@ XNN_DISABLE_TSAN void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4( o += 4; } if (c != 0) { - const __m128 vi0 = _mm_loadu_ps(i0); - const __m128 vi1 = _mm_loadu_ps(i1); - const __m128 vi2 = _mm_loadu_ps(i2); - const __m128 vi3 = _mm_loadu_ps(i3); - const __m128 vi4 = _mm_loadu_ps(i4); - const __m128 vi5 = _mm_loadu_ps(i5); - const __m128 vi6 = _mm_loadu_ps(i6); - const __m128 vi7 = _mm_loadu_ps(i7); - const __m128 vo = _mm_loadu_ps(o); + const __m128 vi0 = _mm_loadu_ps_notsan(i0); + const __m128 vi1 = _mm_loadu_ps_notsan(i1); + const __m128 vi2 = _mm_loadu_ps_notsan(i2); + const __m128 vi3 = _mm_loadu_ps_notsan(i3); + const __m128 vi4 = _mm_loadu_ps_notsan(i4); + const __m128 vi5 = _mm_loadu_ps_notsan(i5); + const __m128 vi6 = _mm_loadu_ps_notsan(i6); + const __m128 vi7 = _mm_loadu_ps_notsan(i7); + const __m128 vo = _mm_loadu_ps_notsan(o); const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo); const __m128 vmax23 = _mm_max_ps(vi2, vi3); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c index ae36a7616..5ae3399a0 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -181,7 +182,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c index 9761b8fe9..785b0c760 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -183,7 +184,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c index 9f8cf6979..40c6ce9b3 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -178,7 +179,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c index 238022710..cfb462bb7 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -197,7 +198,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c index 1e60b6534..346dea131 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -201,7 +202,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c index 09f542fd7..53cd00e17 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -194,7 +195,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c index f92b7919d..4e9bf2be4 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -213,7 +214,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c index d58661b86..7c7445e59 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -219,7 +220,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c index 3ab5db388..ca0fde9af 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -210,7 +211,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c index 157b57689..172fdd392 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -146,7 +147,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c index d3ef0b3aa..05d2aa88b 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -165,7 +166,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c index 0460e422b..751020fbe 100644 --- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c +++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -162,7 +163,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8( assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-raddstoreexpminusmax/sse2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-p5.c.in index 51a33c888..6582df80a 100644 --- a/src/f32-raddstoreexpminusmax/sse2-p5.c.in +++ b/src/f32-raddstoreexpminusmax/sse2-p5.c.in @@ -12,6 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/raddstoreexpminusmax.h> @@ -173,7 +174,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x${ELEMENTS_TILE}${"" if ACCU assert(elements >= 1 * sizeof(float)); assert(elements <= 3 * sizeof(float)); // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); + const __m128 vi = _mm_loadu_ps_notsan(input); // Subtract maximum input x := i - i_max. This implies x <= 0. const __m128 vx = _mm_sub_ps(vi, vi_max); diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x12.c b/src/f32-sigmoid/gen/sse2-p5-div-x12.c index da8f0a288..388281d39 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x12.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x12.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -213,7 +214,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x12( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x16.c b/src/f32-sigmoid/gen/sse2-p5-div-x16.c index 65e43cda4..f96d98bd8 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x16.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x16.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -232,7 +233,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x16( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x20.c b/src/f32-sigmoid/gen/sse2-p5-div-x20.c index 916fff048..d0de04a12 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x20.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x20.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -251,7 +252,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x20( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x24.c b/src/f32-sigmoid/gen/sse2-p5-div-x24.c index 920204f93..72363b4ba 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x24.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x24.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -270,7 +271,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x24( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x4.c b/src/f32-sigmoid/gen/sse2-p5-div-x4.c index f543ef817..89dd0a99a 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x4.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x4.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -105,7 +106,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x8.c b/src/f32-sigmoid/gen/sse2-p5-div-x8.c index 21f6fde0f..43c29516f 100644 --- a/src/f32-sigmoid/gen/sse2-p5-div-x8.c +++ b/src/f32-sigmoid/gen/sse2-p5-div-x8.c @@ -12,6 +12,7 @@ #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -194,7 +195,7 @@ void xnn_f32_sigmoid_ukernel__sse2_p5_div_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x12.c b/src/f32-sigmoid/gen/sse41-p5-div-x12.c index 7753dd313..ef61c6aea 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x12.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x12.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -208,7 +209,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x12( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x16.c b/src/f32-sigmoid/gen/sse41-p5-div-x16.c index 92f989a23..a8e38ea5f 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x16.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x16.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -226,7 +227,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x16( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x20.c b/src/f32-sigmoid/gen/sse41-p5-div-x20.c index a58a9cd65..f42ac8693 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x20.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x20.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -244,7 +245,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x20( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x24.c b/src/f32-sigmoid/gen/sse41-p5-div-x24.c index 71979d3a5..72b8fd7b3 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x24.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x24.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -262,7 +263,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x24( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x4.c b/src/f32-sigmoid/gen/sse41-p5-div-x4.c index 54b600a7e..27679fb2d 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x4.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x4.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -104,7 +105,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x8.c b/src/f32-sigmoid/gen/sse41-p5-div-x8.c index 6a9e9db78..5c28a64d6 100644 --- a/src/f32-sigmoid/gen/sse41-p5-div-x8.c +++ b/src/f32-sigmoid/gen/sse41-p5-div-x8.c @@ -12,6 +12,7 @@ #include <smmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -190,7 +191,7 @@ void xnn_f32_sigmoid_ukernel__sse41_p5_div_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-sigmoid/sse-p5-div.c.in b/src/f32-sigmoid/sse-p5-div.c.in index c509e8d3d..05a0607cc 100644 --- a/src/f32-sigmoid/sse-p5-div.c.in +++ b/src/f32-sigmoid/sse-p5-div.c.in @@ -14,6 +14,7 @@ $else: #include <emmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vunary.h> @@ -206,7 +207,7 @@ void xnn_f32_sigmoid_ukernel__${"sse41" if BLEND else "sse2"}_p5_div_x${BATCH_TI y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 vx = _mm_loadu_ps(x); + const __m128 vx = _mm_loadu_ps_notsan(x); // General structure of the algorithm: // / exp(x) / (1 + exp(x)) if x <= 0 diff --git a/src/f32-vbinary/gen/vadd-minmax-sse-x4.c b/src/f32-vbinary/gen/vadd-minmax-sse-x4.c index 0d940c30e..4eed934aa 100644 --- a/src/f32-vbinary/gen/vadd-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vadd-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,8 +59,8 @@ void xnn_f32_vadd_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_add_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vadd-minmax-sse-x8.c b/src/f32-vbinary/gen/vadd-minmax-sse-x8.c index 529e6a5d7..48b29a939 100644 --- a/src/f32-vbinary/gen/vadd-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vadd-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -64,8 +65,8 @@ void xnn_f32_vadd_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_add_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c b/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c index c9d9ae52d..48ef5fecf 100644 --- a/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vaddc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vaddc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_add_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c b/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c index 21853a63f..3a1b371cf 100644 --- a/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vaddc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_add_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c b/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c index 3fda9b4b7..1944f4417 100644 --- a/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vdiv-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,8 +59,8 @@ void xnn_f32_vdiv_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_div_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c b/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c index 3f75e7fd9..0d7da93f6 100644 --- a/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -64,8 +65,8 @@ void xnn_f32_vdiv_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_div_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c b/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c index 2c19a848d..2b9042788 100644 --- a/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vdivc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vdivc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_div_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c b/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c index c2b24059b..6b9191d6a 100644 --- a/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vdivc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_div_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vmax-sse-x4.c b/src/f32-vbinary/gen/vmax-sse-x4.c index 862013901..6e04ea491 100644 --- a/src/f32-vbinary/gen/vmax-sse-x4.c +++ b/src/f32-vbinary/gen/vmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -51,8 +52,8 @@ void xnn_f32_vmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_max_ps(va0123, vb0123); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmax-sse-x8.c b/src/f32-vbinary/gen/vmax-sse-x8.c index 7a506af9c..20ceef219 100644 --- a/src/f32-vbinary/gen/vmax-sse-x8.c +++ b/src/f32-vbinary/gen/vmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -55,8 +56,8 @@ void xnn_f32_vmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_max_ps(va0123, vb0123); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmaxc-sse-x4.c b/src/f32-vbinary/gen/vmaxc-sse-x4.c index 2702effe4..235fce19e 100644 --- a/src/f32-vbinary/gen/vmaxc-sse-x4.c +++ b/src/f32-vbinary/gen/vmaxc-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -46,7 +47,7 @@ void xnn_f32_vmaxc_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_max_ps(va0123, vb); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmaxc-sse-x8.c b/src/f32-vbinary/gen/vmaxc-sse-x8.c index 642bfe669..30bbf23bb 100644 --- a/src/f32-vbinary/gen/vmaxc-sse-x8.c +++ b/src/f32-vbinary/gen/vmaxc-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -49,7 +50,7 @@ void xnn_f32_vmaxc_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_max_ps(va0123, vb); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmin-sse-x4.c b/src/f32-vbinary/gen/vmin-sse-x4.c index c9ca659ef..7875e8869 100644 --- a/src/f32-vbinary/gen/vmin-sse-x4.c +++ b/src/f32-vbinary/gen/vmin-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -51,8 +52,8 @@ void xnn_f32_vmin_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_min_ps(va0123, vb0123); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmin-sse-x8.c b/src/f32-vbinary/gen/vmin-sse-x8.c index 92770a692..28b51877e 100644 --- a/src/f32-vbinary/gen/vmin-sse-x8.c +++ b/src/f32-vbinary/gen/vmin-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -55,8 +56,8 @@ void xnn_f32_vmin_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_min_ps(va0123, vb0123); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vminc-sse-x4.c b/src/f32-vbinary/gen/vminc-sse-x4.c index 4fac969d2..cea189135 100644 --- a/src/f32-vbinary/gen/vminc-sse-x4.c +++ b/src/f32-vbinary/gen/vminc-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -46,7 +47,7 @@ void xnn_f32_vminc_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_min_ps(va0123, vb); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vminc-sse-x8.c b/src/f32-vbinary/gen/vminc-sse-x8.c index e3c579925..5b3e65646 100644 --- a/src/f32-vbinary/gen/vminc-sse-x8.c +++ b/src/f32-vbinary/gen/vminc-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -49,7 +50,7 @@ void xnn_f32_vminc_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_min_ps(va0123, vb); if (n & (2 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/vmul-minmax-sse-x4.c b/src/f32-vbinary/gen/vmul-minmax-sse-x4.c index e50094109..a1fc3057f 100644 --- a/src/f32-vbinary/gen/vmul-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vmul-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,8 +59,8 @@ void xnn_f32_vmul_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_mul_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vmul-minmax-sse-x8.c b/src/f32-vbinary/gen/vmul-minmax-sse-x8.c index 7c3186708..2678c4b25 100644 --- a/src/f32-vbinary/gen/vmul-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vmul-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -64,8 +65,8 @@ void xnn_f32_vmul_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_mul_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c b/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c index 19c9cb1af..e01cf39b6 100644 --- a/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vmulc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vmulc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_mul_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c b/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c index a1fb4a733..6a0d03367 100644 --- a/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vmulc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_mul_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c b/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c index 51f0811c5..d18b9d1ed 100644 --- a/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_div_ps(vb, va0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c index 48eb7411b..01ac18c1e 100644 --- a/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_div_ps(vb, va0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c b/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c index a9c0df140..0523b6900 100644 --- a/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_sub_ps(vb, va0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c index 7be1c4681..e03085dc3 100644 --- a/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_sub_ps(vb, va0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vsub-minmax-sse-x4.c b/src/f32-vbinary/gen/vsub-minmax-sse-x4.c index 37268fced..e3ca419dc 100644 --- a/src/f32-vbinary/gen/vsub-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vsub-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,8 +59,8 @@ void xnn_f32_vsub_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_sub_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vsub-minmax-sse-x8.c b/src/f32-vbinary/gen/vsub-minmax-sse-x8.c index ff1866560..33568d0ba 100644 --- a/src/f32-vbinary/gen/vsub-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vsub-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -64,8 +65,8 @@ void xnn_f32_vsub_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = _mm_sub_ps(va0123, vb0123); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c b/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c index b404ad45b..fde8b8efd 100644 --- a/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c +++ b/src/f32-vbinary/gen/vsubc-minmax-sse-x4.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -53,7 +54,7 @@ void xnn_f32_vsubc_minmax_ukernel__sse_x4( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_sub_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c b/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c index 4db9df9c0..11cc16d43 100644 --- a/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c +++ b/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c @@ -12,6 +12,7 @@ #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -58,7 +59,7 @@ void xnn_f32_vsubc_minmax_ukernel__sse_x8( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = _mm_sub_ps(va0123, vb); vy0123 = _mm_max_ps(vy0123, vy_min); diff --git a/src/f32-vbinary/vop-sse.c.in b/src/f32-vbinary/vop-sse.c.in index e0425d292..fd096ea1a 100644 --- a/src/f32-vbinary/vop-sse.c.in +++ b/src/f32-vbinary/vop-sse.c.in @@ -13,6 +13,7 @@ $assert ACTIVATION in ["LINEAR", "MINMAX"] #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -82,8 +83,8 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__sse_x${BATCH_TILE}( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); - const __m128 vb0123 = _mm_loadu_ps(b); + const __m128 va0123 = _mm_loadu_ps_notsan(a); + const __m128 vb0123 = _mm_loadu_ps_notsan(b); __m128 vy0123 = ${_MM_OP_PS("va0123", "vb0123")}; $if ACTIVATION == "MINMAX": diff --git a/src/f32-vbinary/vopc-sse.c.in b/src/f32-vbinary/vopc-sse.c.in index 75fb3fe16..0f27ca3d9 100644 --- a/src/f32-vbinary/vopc-sse.c.in +++ b/src/f32-vbinary/vopc-sse.c.in @@ -13,6 +13,7 @@ $assert ACTIVATION in ["LINEAR", "MINMAX"] #include <xmmintrin.h> #include <xnnpack/common.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/vbinary.h> @@ -77,7 +78,7 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__sse_x${BATCH_TILE}( y += 4; } if XNN_UNLIKELY(n != 0) { - const __m128 va0123 = _mm_loadu_ps(a); + const __m128 va0123 = _mm_loadu_ps_notsan(a); __m128 vy0123 = ${_MM_OP_PS("va0123")}; $if ACTIVATION == "MINMAX": diff --git a/src/u8-maxpool/9p8x-minmax-sse2-c16.c b/src/u8-maxpool/9p8x-minmax-sse2-c16.c index 1997fe73d..ba79f6986 100644 --- a/src/u8-maxpool/9p8x-minmax-sse2-c16.c +++ b/src/u8-maxpool/9p8x-minmax-sse2-c16.c @@ -10,10 +10,11 @@ #include <emmintrin.h> +#include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/maxpool.h> -XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16( +void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16( size_t output_pixels, size_t kernel_elements, size_t channels, @@ -102,15 +103,15 @@ XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16( _mm_storeu_si128((__m128i*) o, vout); o += 16; } if (c != 0) { - const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); - const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); - const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); - const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); - const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); - const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); - const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); - const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); - const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8); + const __m128i vi0 = _mm_loadu_si128_notsan((const __m128i*) i0); + const __m128i vi1 = _mm_loadu_si128_notsan((const __m128i*) i1); + const __m128i vi2 = _mm_loadu_si128_notsan((const __m128i*) i2); + const __m128i vi3 = _mm_loadu_si128_notsan((const __m128i*) i3); + const __m128i vi4 = _mm_loadu_si128_notsan((const __m128i*) i4); + const __m128i vi5 = _mm_loadu_si128_notsan((const __m128i*) i5); + const __m128i vi6 = _mm_loadu_si128_notsan((const __m128i*) i6); + const __m128i vi7 = _mm_loadu_si128_notsan((const __m128i*) i7); + const __m128i vi8 = _mm_loadu_si128_notsan((const __m128i*) i8); const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8); const __m128i vmax23 = _mm_max_epu8(vi2, vi3); @@ -210,15 +211,15 @@ XNN_DISABLE_TSAN void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16( o += 16; } if (c != 0) { - const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); - const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); - const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); - const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); - const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); - const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); - const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); - const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); - const __m128i vo = _mm_loadu_si128((const __m128i*) o); + const __m128i vi0 = _mm_loadu_si128_notsan((const __m128i*) i0); + const __m128i vi1 = _mm_loadu_si128_notsan((const __m128i*) i1); + const __m128i vi2 = _mm_loadu_si128_notsan((const __m128i*) i2); + const __m128i vi3 = _mm_loadu_si128_notsan((const __m128i*) i3); + const __m128i vi4 = _mm_loadu_si128_notsan((const __m128i*) i4); + const __m128i vi5 = _mm_loadu_si128_notsan((const __m128i*) i5); + const __m128i vi6 = _mm_loadu_si128_notsan((const __m128i*) i6); + const __m128i vi7 = _mm_loadu_si128_notsan((const __m128i*) i7); + const __m128i vo = _mm_loadu_si128_notsan((const __m128i*) o); const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo); const __m128i vmax23 = _mm_max_epu8(vi2, vi3); diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h index 86e9b3ac7..4063a5e14 100644 --- a/src/xnnpack/common.h +++ b/src/xnnpack/common.h @@ -188,6 +188,14 @@ #endif #if defined(__GNUC__) + #define XNN_INTRINSIC inline __attribute__((__always_inline__, __artificial__)) +#elif defined(_MSC_VER) + #define XNN_INTRINSIC __forceinline +#else + #define XNN_INTRINSIC inline +#endif + +#if defined(__GNUC__) #define XNN_INLINE inline __attribute__((__always_inline__)) #elif defined(_MSC_VER) #define XNN_INLINE __forceinline diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h index 51da04e82..caa192d30 100644 --- a/src/xnnpack/intrinsics-polyfill.h +++ b/src/xnnpack/intrinsics-polyfill.h @@ -5,6 +5,27 @@ #pragma once +#include <xnnpack/common.h> + + +#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) +#include <xmmintrin.h> + +static XNN_INTRINSIC XNN_DISABLE_TSAN +__m128 _mm_loadu_ps_notsan(const float* address) { + return _mm_loadu_ps(address); +} +#endif + +#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) +#include <emmintrin.h> + +static XNN_INTRINSIC XNN_DISABLE_TSAN +__m128i _mm_loadu_si128_notsan(const __m128i* address) { + return _mm_loadu_si128(address); +} +#endif + #ifdef __AVX512F__ #include <immintrin.h> @@ -15,8 +36,8 @@ (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) -static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask16(unsigned int mask) { +static XNN_INTRINSIC +__mmask16 _cvtu32_mask16(unsigned int mask) { return (__mmask16) mask; } @@ -27,8 +48,8 @@ _cvtu32_mask16(unsigned int mask) { (defined(__clang__) && (__clang_major__ < 4)) || \ (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) -static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_ps(__m512 v) { +static XNN_INTRINSIC +float _mm512_reduce_add_ps(__m512 v) { #if __AVX512DQ__ const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); #else @@ -40,8 +61,8 @@ _mm512_reduce_add_ps(__m512 v) { return _mm_cvtss_f32(sum16); } -static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_ps(__m512 v) { +static XNN_INTRINSIC +float _mm512_reduce_max_ps(__m512 v) { #if __AVX512DQ__ const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); #else |