author     dario mambro <dario.mambro@gmail.com>    2020-12-19 15:25:21 +0100
committer  dario mambro <dario.mambro@gmail.com>    2020-12-24 14:32:36 +0100
commit     8dc269e569cfe6c796bf37a30c5cc4798be8a750 (patch)
tree       e4389a26f6d726ce347dcf747de4a37b161d760e
parent     5fb07d7117dab7e1511895e8106224876abadd00 (diff)
download   pffft-8dc269e569cfe6c796bf37a30c5cc4798be8a750.tar.gz
simplified some neon code, changed some tabs into spaces
(cherry picked from commit c92f08c8226e4c069436751b09554ada362ae7c8)
-rw-r--r--   simd/pf_neon_double.h            31
-rw-r--r--   simd/pf_neon_double_from_avx.h   83
2 files changed, 52 insertions(+), 62 deletions(-)
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
index 1c8b852..140f465 100644
--- a/simd/pf_neon_double.h
+++ b/simd/pf_neon_double.h
@@ -65,6 +65,13 @@ typedef union v4sf_union {
 #  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
 
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b;
+    return res;
+}
 
 FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
 {
@@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ]
     __m128d low2__ = _mm256_castpd256_pd128(in2); \
     __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
     __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
-    __m256d tmp__ = _mm256_insertf128_pd( \
+    __m256d tmp__ = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
-        _mm_shuffle_pd_11(low1__, low2__), \
-        1); \
-    out2 = _mm256_insertf128_pd( \
+        _mm_shuffle_pd_11(low1__, low2__)); \
+    out2 = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
-        _mm_shuffle_pd_11(high1__, high2__), \
-        1); \
+        _mm_shuffle_pd_11(high1__, high2__)); \
     out1 = tmp__; \
 }
 
@@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
     __m128d low2__ = _mm256_castpd256_pd128(in2); \
     __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
     __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
-    __m256d tmp__ = _mm256_insertf128_pd( \
+    __m256d tmp__ = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
-        _mm_shuffle_pd_00(low2__, high2__), \
-        1); \
-    out2 = _mm256_insertf128_pd( \
+        _mm_shuffle_pd_00(low2__, high2__)); \
+    out2 = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
-        _mm_shuffle_pd_11(low2__, high2__), \
-        1); \
+        _mm_shuffle_pd_11(low2__, high2__)); \
     out1 = tmp__; \
 }
 
@@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
    return [ b[0], b[1], a[2], a[3] ]
 */
 #  define VSWAPHL(a,b) \
-    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
 
 /* reverse/flip all floats */
 #  define VREV_S(a)   _mm256_reverse(a)
 
 /* reverse/flip complex floats */
-#  define VREV_C(a)   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+#  define VREV_C(a)   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
 
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
index c8cd74e..5cce17e 100644
--- a/simd/pf_neon_double_from_avx.h
+++ b/simd/pf_neon_double_from_avx.h
@@ -25,9 +25,7 @@
 #if defined(__GNUC__) || defined(__clang__)
 
 #pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
 #define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
 
 #else
 
@@ -37,99 +35,88 @@
 #endif
 
 #define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
 
 #endif
 
 typedef struct {
-	float32x4_t vect_f32[2];
+    float32x4_t vect_f32[2];
 } __m256;
 
 typedef struct {
-	float64x2_t vect_f64[2];
+    float64x2_t vect_f64[2];
 } __m256d;
 
 typedef float64x2_t __m128d;
 
 FORCE_INLINE __m256d _mm256_setzero_pd(void)
 {
-	__m256d ret;
-	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
-	return ret;
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+    return ret;
 }
 
 FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
 }
 
 FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
}
 
 FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
 }
 
 FORCE_INLINE __m256d _mm256_set1_pd(double a)
 {
-	__m256d ret;
-	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
-	return ret;
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+    return ret;
 }
 
 FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
 {
-	__m256d res;
-	res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
-	res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
-	return res;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
 }
 
 FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
 {
-	__m256d res;
-	res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
-	res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
-	return res;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
 }
 
 FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
 {
-	return a.vect_f64[0];
+    return a.vect_f64[0];
 }
 
 FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
 {
-	assert(imm8 >= 0 && imm8 <= 1);
-	return a.vect_f64[imm8];
-}
-FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
-{
-	assert(imm8 == 0 || imm8 == 1);
-	__m256d res;
-	uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
-	res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
-	res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
-	return res;
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_f64[imm8];
 }
+
 FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
 {
-	__m256d res;
-	res.vect_f64[0] = a;
-	return res;
+    __m256d res;
+    res.vect_f64[0] = a;
+    return res;
 }
 
 #endif /* PF_AVX_DBL_H */
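
Context for the change above: the removed _mm256_insertf128_pd emulated Intel's variable-index insert with a runtime vceqq_s64/vbslq_f64 lane select, yet every call site in pf_neon_double.h passed the constant index 1. The new _mm256_insertf128_pd_1 hard-codes that case as two plain assignments, so the mask machinery disappears entirely. The following is a minimal plain-C sketch of the same idea, buildable without NEON; the pd128/pd256 stand-in types and the insert_generic/insert_upper helpers are illustrative names, not part of pffft.

/* sketch.c - why specializing the insert index simplifies the code */
#include <assert.h>
#include <stdio.h>

typedef struct { double lane[2]; } pd128;   /* stands in for __m128d */
typedef struct { pd128 half[2]; } pd256;    /* stands in for __m256d */

/* generic insert: lane index is chosen at runtime, as the removed
   _mm256_insertf128_pd emulation did with a vbslq_f64 mask select */
static pd256 insert_generic(pd256 a, pd128 b, int imm8)
{
    assert(imm8 == 0 || imm8 == 1);
    pd256 res = a;
    res.half[imm8] = b;                      /* runtime lane selection */
    return res;
}

/* specialized insert into the upper half, mirroring the shape of the
   new _mm256_insertf128_pd_1: no index, no mask, two copies */
static pd256 insert_upper(pd256 a, pd128 b)
{
    pd256 res;
    res.half[0] = a.half[0];                 /* keep low half of a */
    res.half[1] = b;                         /* overwrite high half */
    return res;
}

int main(void)
{
    pd256 a = { { { { 1.0, 2.0 } }, { { 3.0, 4.0 } } } };
    pd128 b = { { 9.0, 8.0 } };

    pd256 r_generic = insert_generic(a, b, 1);
    pd256 r_fixed   = insert_upper(a, b);

    /* both forms agree for index 1, the only index pffft used */
    assert(r_generic.half[1].lane[0] == r_fixed.half[1].lane[0]);
    assert(r_generic.half[0].lane[1] == r_fixed.half[0].lane[1]);

    printf("%g %g %g %g\n",
           r_fixed.half[0].lane[0], r_fixed.half[0].lane[1],
           r_fixed.half[1].lane[0], r_fixed.half[1].lane[1]);  /* 1 2 9 8 */
    return 0;
}

Because the index is fixed, the diff can also drop the third argument at every call site (VSWAPHL, VREV_C, and the interleave macros), which is where most of the 62 deleted lines in pf_neon_double.h come from.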