From 5fb07d7117dab7e1511895e8106224876abadd00 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 01:52:10 +0100 Subject: added support for doubles on neon (cherry picked from commit d43dfeca5679624cb04ef282d6807910fd218871) --- simd/pf_double.h | 1 + simd/pf_neon_double.h | 200 +++++++++++++++++++++++++++++++++++++++++ simd/pf_neon_double_from_avx.h | 136 ++++++++++++++++++++++++++++ 3 files changed, 337 insertions(+) create mode 100644 simd/pf_neon_double.h create mode 100644 simd/pf_neon_double_from_avx.h diff --git a/simd/pf_double.h b/simd/pf_double.h index 2052bbd..c6c73ab 100644 --- a/simd/pf_double.h +++ b/simd/pf_double.h @@ -60,6 +60,7 @@ typedef double vsfscalar; #include "pf_avx_double.h" +#include "pf_neon_double.h" #ifndef SIMD_SZ # if !defined(PFFFT_SIMD_DISABLE) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h new file mode 100644 index 0000000..1c8b852 --- /dev/null +++ b/simd/pf_neon_double.h @@ -0,0 +1,200 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_NEON_DBL_H +#define PF_NEON_DBL_H + +/* + NEON 64bit support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) + +#pragma message __FILE__ ": NEON (from AVX) macros are defined" + +#include "pf_neon_double_from_avx.h" +typedef __m256d v4sf; + +/* 4 doubles by simd vector */ +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "NEON" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + + +FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) +{ + float64x1_t al = vget_low_f64(a); + float64x1_t bl = vget_low_f64(b); + return vcombine_f64(al, bl); +} + +FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b) +{ + float64x1_t ah = vget_high_f64(a); + float64x1_t bh = vget_high_f64(b); + return vcombine_f64(ah, bh); +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b.vect_f64[0]; + return res; +} + + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[1]; + res.vect_f64[1] = b.vect_f64[1]; + return res; +} + +FORCE_INLINE __m256d _mm256_reverse(__m256d x) +{ + __m256d res; + float64x2_t low = x.vect_f64[0]; + float64x2_t high = x.vect_f64[1]; + float64x1_t a = vget_low_f64(low); + float64x1_t b = vget_high_f64(low); + float64x1_t c = vget_low_f64(high); + float64x1_t d = vget_high_f64(high); + res.vect_f64[0] = vcombine_f64(d, c); + res.vect_f64[1] = vcombine_f64(b, a); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ + _mm_shuffle_pd_11(low1__, low2__), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ + _mm_shuffle_pd_11(high1__, high2__), \ + 1); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 
1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ + _mm_shuffle_pd_00(low2__, high2__), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ + _mm_shuffle_pd_11(low2__, high2__), \ + 1); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif + +#endif /* PF_AVX_DBL_H */ + diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h new file mode 100644 index 0000000..c8cd74e --- /dev/null +++ b/simd/pf_neon_double_from_avx.h @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ + */ + +//see https://github.com/kunpengcompute/AvxToNeon + +#ifndef PF_NEON_DBL_FROM_AVX_H +#define PF_NEON_DBL_FROM_AVX_H +#include + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) + +#else + +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif + +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +#endif + +typedef struct { + float32x4_t vect_f32[2]; +} __m256; + +typedef struct { + float64x2_t vect_f64[2]; +} __m256d; + +typedef float64x2_t __m128d; + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.vect_f64[0]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; +} +FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m256d res; + uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); + res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]); + res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b); + return res; +} +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.vect_f64[0] = a; + return res; +} + +#endif /* PF_AVX_DBL_H */ + -- cgit v1.2.3 From 8dc269e569cfe6c796bf37a30c5cc4798be8a750 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 15:25:21 +0100 Subject: simplified some neon code, changed some tabs into spaces (cherry picked from commit c92f08c8226e4c069436751b09554ada362ae7c8) --- simd/pf_neon_double.h | 31 +++++++++------- simd/pf_neon_double_from_avx.h | 83 ++++++++++++++++++------------------------ 2 files changed, 52 insertions(+), 62 deletions(-) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h index 1c8b852..140f465 100644 --- a/simd/pf_neon_double.h +++ b/simd/pf_neon_double.h @@ -65,6 +65,13 @@ typedef union 
v4sf_union { # define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) # define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b; + return res; +} FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) { @@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ - _mm_shuffle_pd_11(low1__, low2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_11(low1__, low2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ - _mm_shuffle_pd_11(high1__, high2__), \ - 1); \ + _mm_shuffle_pd_11(high1__, high2__)); \ out1 = tmp__; \ } @@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ - _mm_shuffle_pd_00(low2__, high2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_00(low2__, high2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ - _mm_shuffle_pd_11(low2__, high2__), \ - 1); \ + _mm_shuffle_pd_11(low2__, high2__)); \ out1 = tmp__; \ } @@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] return [ b[0], b[1], a[2], a[3] ] */ # define VSWAPHL(a,b) \ - _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) /* reverse/flip all floats */ # define VREV_S(a) _mm256_reverse(a) /* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h index c8cd74e..5cce17e 100644 --- a/simd/pf_neon_double_from_avx.h +++ b/simd/pf_neon_double_from_avx.h @@ -25,9 +25,7 @@ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") -#pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) #else @@ -37,99 +35,88 @@ #endif #define FORCE_INLINE static inline -#ifndef ALIGN_STRUCT -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif #endif typedef struct { - float32x4_t vect_f32[2]; + float32x4_t vect_f32[2]; } __m256; typedef struct { - float64x2_t vect_f64[2]; + float64x2_t vect_f64[2]; } __m256d; typedef float64x2_t __m128d; FORCE_INLINE __m256d _mm256_setzero_pd(void) { - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); - return ret; + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; } FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = 
vmulq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_set1_pd(double a) { - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); - return ret; + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; } FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) { - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; } FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) { - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; } FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) { - return a.vect_f64[0]; + return a.vect_f64[0]; } FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) { - assert(imm8 >= 0 && imm8 <= 1); - return a.vect_f64[imm8]; -} -FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) -{ - assert(imm8 == 0 || imm8 == 1); - __m256d res; - uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); - res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]); - res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b); - return res; + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; } + FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) { - __m256d res; - res.vect_f64[0] = a; - return res; + __m256d res; + res.vect_f64[0] = a; + return res; } #endif /* PF_AVX_DBL_H */ -- cgit v1.2.3 From 6d78ad89cc4f3802cdedd0ec5e2bcf9893ba20cc Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 22:23:29 +0100 Subject: added support for doubles with sse2 (cherry picked from commit a52039328c08555b1143dbc9100b8f6df44f2b90) --- CMakeLists.txt | 10 +- simd/pf_avx_double.h | 2 +- simd/pf_double.h | 1 + simd/pf_sse2_double.h | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 simd/pf_sse2_double.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 47cecb5..11dad3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ option(USE_TYPE_DOUBLE "activate 
'double' precision float?" ON) # architecture/optimization options option(USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON) +option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF) option(USE_SIMD_NEON "force using NEON on ARM? (requires USE_SIMD)" OFF) option(USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON) @@ -133,10 +134,17 @@ if (USE_SIMD AND USE_SIMD_NEON) endif() if (USE_SIMD AND USE_TYPE_DOUBLE) if(WIN32) - set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX") + if(DISABLE_SIMD_AVX) + set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:SSE2") + else() + set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX") + endif() else() set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "-march=native") endif() + if(DISABLE_SIMD_AVX) + target_compile_definitions(PFFFT PRIVATE PFFFT_AVX_DISABLE=1) + endif() endif() target_link_libraries( PFFFT ${MATHLIB} ) set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES diff --git a/simd/pf_avx_double.h b/simd/pf_avx_double.h index fe0efa8..251f0b9 100644 --- a/simd/pf_avx_double.h +++ b/simd/pf_avx_double.h @@ -46,7 +46,7 @@ /* AVX support macros */ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__) +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && !defined(PFFFT_AVX_DISABLE) && defined(__AVX__) #pragma message( __FILE__ ": AVX macros are defined" ) #include diff --git a/simd/pf_double.h b/simd/pf_double.h index c6c73ab..1025827 100644 --- a/simd/pf_double.h +++ b/simd/pf_double.h @@ -60,6 +60,7 @@ typedef double vsfscalar; #include "pf_avx_double.h" +#include "pf_sse2_double.h" #include "pf_neon_double.h" #ifndef SIMD_SZ diff --git a/simd/pf_sse2_double.h b/simd/pf_sse2_double.h new file mode 100644 index 0000000..1c1739d --- /dev/null +++ b/simd/pf_sse2_double.h @@ -0,0 +1,272 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_NEON_DBL_H +#define PF_NEON_DBL_H + +/* + SSE2 64bit support macros +*/ +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (( __SSE2__ ) || defined ( __x86_64__ )) +#pragma message __FILE__ ": SSE2 double macros are defined" + +#include + +typedef struct { + __m128d d128[2]; +} __m256d; + +typedef __m256d v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#define FORCE_INLINE static inline __attribute__((always_inline)) + +#elif defined (_MSC_VER) +#define FORCE_INLINE static __forceinline + +#else +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#endif + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); + return ret; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +{ + __m256d res; + res.d128[0] = _mm_load_pd((const double *)mem_addr); + res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +{ + __m256d res; + res.d128[0] = _mm_loadu_pd((const double *)mem_addr); + res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); + return res; +} + + +# define VARCH "SSE2" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.d128[0]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.d128[imm8]; +} +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = b; + return res; +} +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.d128[0] = a; + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); + return res; +} + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { + __m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = 
b.d128[0]; + return res; +} + + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = a.d128[1]; + res.d128[1] = b.d128[1]; + return res; +} + +FORCE_INLINE __m256d _mm256_reverse(__m256d x) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); + res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + _mm_shuffle_pd(low1__, low2__, 3)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + _mm_shuffle_pd(high1__, high2__, 3)); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + _mm_shuffle_pd(low2__, high2__, 0)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + _mm_shuffle_pd(low2__, high2__, 3)); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif +#endif -- cgit v1.2.3 From 776abf7ae79e9a43972b9d86dfe2cce2962a7897 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Thu, 24 Dec 2020 17:05:10 +0100 Subject: fixes in sse2 and neon implementation for doubles --- simd/pf_neon_double.h | 2 +- simd/pf_sse2_double.h | 149 ++++++++++++++++++++++++++------------------------ 2 files changed, 80 insertions(+), 71 deletions(-) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h index 140f465..e432abc 100644 --- a/simd/pf_neon_double.h +++ b/simd/pf_neon_double.h @@ -41,7 +41,7 @@ */ #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) -#pragma message __FILE__ ": NEON (from AVX) macros are defined" +#pragma message 
(__FILE__ ": NEON (from AVX) macros are defined" ) #include "pf_neon_double_from_avx.h" typedef __m256d v4sf; diff --git a/simd/pf_sse2_double.h b/simd/pf_sse2_double.h index 1c1739d..6c53e8f 100644 --- a/simd/pf_sse2_double.h +++ b/simd/pf_sse2_double.h @@ -36,19 +36,28 @@ #ifndef PF_NEON_DBL_H #define PF_NEON_DBL_H +//detect sse2 support under MSVC +#if defined ( _M_IX86_FP ) +# if _M_IX86_FP == 2 +# if !defined(__SSE2__) +# define __SSE2__ +# endif +# endif +#endif + /* SSE2 64bit support macros */ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (( __SSE2__ ) || defined ( __x86_64__ )) -#pragma message __FILE__ ": SSE2 double macros are defined" +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ )) +#pragma message (__FILE__ ": SSE2 double macros are defined" ) #include typedef struct { __m128d d128[2]; -} __m256d; +} m256d; -typedef __m256d v4sf; +typedef m256d v4sf; # define SIMD_SZ 4 @@ -74,54 +83,54 @@ typedef union v4sf_union { #define FORCE_INLINE static inline #endif -FORCE_INLINE __m256d _mm256_setzero_pd(void) +FORCE_INLINE m256d mm256_setzero_pd(void) { - __m256d ret; + m256d ret; ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); return ret; } -FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_set1_pd(double a) +FORCE_INLINE m256d mm256_set1_pd(double a) { - __m256d ret; + m256d ret; ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); return ret; } -FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +FORCE_INLINE m256d mm256_load_pd (double const * mem_addr) { - __m256d res; + m256d res; res.d128[0] = _mm_load_pd((const double *)mem_addr); res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); return res; } -FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr) { - __m256d res; + m256d res; res.d128[0] = _mm_loadu_pd((const double *)mem_addr); res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); return res; @@ -130,75 +139,75 @@ FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) # define VARCH "SSE2" # define VREQUIRES_ALIGN 1 -# define VZERO() _mm256_setzero_pd() -# define VMUL(a,b) _mm256_mul_pd(a,b) -# define VADD(a,b) _mm256_add_pd(a,b) -# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) -# define VSUB(a,b) _mm256_sub_pd(a,b) -# define LD_PS1(p) _mm256_set1_pd(p) -# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) -# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) +# define VZERO() mm256_setzero_pd() +# define VMUL(a,b) mm256_mul_pd(a,b) +# define VADD(a,b) mm256_add_pd(a,b) +# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c) +# define VSUB(a,b) mm256_sub_pd(a,b) +# define LD_PS1(p) mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) 
mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr) -FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a) { return a.d128[0]; } -FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8) { assert(imm8 >= 0 && imm8 <= 1); return a.d128[imm8]; } -FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b) { - __m256d res; + m256d res; res.d128[0] = a.d128[0]; res.d128[1] = b; return res; } -FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a) { - __m256d res; + m256d res; res.d128[0] = a; return res; } -FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); return res; } -FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); return res; } -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { - __m256d res; +FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) { + m256d res; res.d128[0] = a.d128[0]; res.d128[1] = b.d128[0]; return res; } -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = a.d128[1]; res.d128[1] = b.d128[1]; return res; } -FORCE_INLINE __m256d _mm256_reverse(__m256d x) +FORCE_INLINE m256d mm256_reverse(m256d x) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); return res; @@ -209,15 +218,15 @@ out1 = [ in1[0], in2[0], in1[1], in2[1] ] out2 = [ in1[2], in2[2], in1[3], in2[3] ] */ # define INTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ _mm_shuffle_pd(low1__, low2__, 3)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ _mm_shuffle_pd(high1__, high2__, 3)); \ out1 = tmp__; \ } @@ -227,44 +236,44 @@ out1 = [ in1[0], in1[2], in2[0], in2[2] ] out2 = [ in1[1], in1[3], in2[1], in2[3] ] */ # define UNINTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - 
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ _mm_shuffle_pd(low2__, high2__, 0)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ _mm_shuffle_pd(low2__, high2__, 3)); \ out1 = tmp__; \ } # define VTRANSPOSE4(row0, row1, row2, row3) { \ - __m256d tmp3, tmp2, tmp1, tmp0; \ + m256d tmp3, tmp2, tmp1, tmp0; \ \ - tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ - tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ - tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ - tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + tmp0 = mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = mm256_shuffle_pd_11((row2),(row3)); \ \ - (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ - (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ - (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ - (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \ } /*VSWAPHL(a, b) pseudo code: return [ b[0], b[1], a[2], a[3] ] */ # define VSWAPHL(a,b) \ - _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) + mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1)) /* reverse/flip all floats */ -# define VREV_S(a) _mm256_reverse(a) +# define VREV_S(a) mm256_reverse(a) /* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) +# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a)) # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) -- cgit v1.2.3 From 00ba746dacda897031a62a54b2e50c6be9d6320b Mon Sep 17 00:00:00 2001 From: dario mambro Date: Thu, 24 Dec 2020 17:06:03 +0100 Subject: added cmake option to fix building with MSVC using clangCL --- CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11dad3b..7856b75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ option(USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" option(USE_DEBUG_ASAN "use GCC's address sanitizer?" 
OFF) +option(DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF) # C90 requires the gcc extensions for function attributes like always_inline # C99 provides the function attributes: no gcc extensions required @@ -95,8 +96,11 @@ if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" ) ) else() - message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") - set(MATHLIB "m") + if(DISABLE_LINK_WITH_M) + else() + message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") + set(MATHLIB "m") + endif() endif() set( SIMD_FLOAT_HDRS simd/pf_float.h simd/pf_sse1_float.h simd/pf_altivec_float.h simd/pf_neon_float.h simd/pf_scalar_float.h ) -- cgit v1.2.3
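
Reference note (not part of the patches above): the INTERLEAVE2 and UNINTERLEAVE2 macros introduced for both the NEON and SSE2 double paths are documented only by the pseudo-code comments in the diffs. A minimal, standalone scalar sketch of those semantics is given below for reference and sanity checking; it assumes SIMD_SZ == 4 (four doubles per vector), and the helper names are hypothetical, not part of pffft.

#include <assert.h>

/* Scalar reference for the INTERLEAVE2 / UNINTERLEAVE2 pseudo-code in the
   patches above, operating on 4-double "vectors" (SIMD_SZ == 4). */
static void interleave2_ref(const double in1[4], const double in2[4],
                            double out1[4], double out2[4])
{
    /* out1 = [ in1[0], in2[0], in1[1], in2[1] ]
       out2 = [ in1[2], in2[2], in1[3], in2[3] ] */
    out1[0] = in1[0]; out1[1] = in2[0]; out1[2] = in1[1]; out1[3] = in2[1];
    out2[0] = in1[2]; out2[1] = in2[2]; out2[2] = in1[3]; out2[3] = in2[3];
}

static void uninterleave2_ref(const double in1[4], const double in2[4],
                              double out1[4], double out2[4])
{
    /* out1 = [ in1[0], in1[2], in2[0], in2[2] ]
       out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
    out1[0] = in1[0]; out1[1] = in1[2]; out1[2] = in2[0]; out1[3] = in2[2];
    out2[0] = in1[1]; out2[1] = in1[3]; out2[2] = in2[1]; out2[3] = in2[3];
}

int main(void)
{
    const double a[4] = {0, 1, 2, 3}, b[4] = {4, 5, 6, 7};
    double x[4], y[4], u[4], v[4];
    interleave2_ref(a, b, x, y);   /* x = {0,4,1,5}, y = {2,6,3,7}      */
    uninterleave2_ref(x, y, u, v); /* round-trips back to a and b       */
    for (int i = 0; i < 4; ++i) { assert(u[i] == a[i]); assert(v[i] == b[i]); }
    return 0;
}

Interleave followed by uninterleave is an identity, which is the property the SIMD macro implementations (built from the emulated shuffle/insert/extract helpers) are expected to preserve.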