author     dario mambro <dario.mambro@gmail.com>    2020-12-19 15:25:21 +0100
committer  dario mambro <dario.mambro@gmail.com>    2020-12-24 14:32:36 +0100
commit     8dc269e569cfe6c796bf37a30c5cc4798be8a750 (patch)
tree       e4389a26f6d726ce347dcf747de4a37b161d760e
parent     5fb07d7117dab7e1511895e8106224876abadd00 (diff)
download   pffft-8dc269e569cfe6c796bf37a30c5cc4798be8a750.tar.gz
simplified some neon code, changed some tabs into spaces
(cherry picked from commit c92f08c8226e4c069436751b09554ada362ae7c8)
-rw-r--r--   simd/pf_neon_double.h            31
-rw-r--r--   simd/pf_neon_double_from_avx.h   83
2 files changed, 52 insertions(+), 62 deletions(-)
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
index 1c8b852..140f465 100644
--- a/simd/pf_neon_double.h
+++ b/simd/pf_neon_double.h
@@ -65,6 +65,13 @@ typedef union v4sf_union {
 #  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
 
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b;
+    return res;
+}
 
 FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
 {
@@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ]
     __m128d low2__ = _mm256_castpd256_pd128(in2); \
     __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
     __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
-    __m256d tmp__ = _mm256_insertf128_pd( \
+    __m256d tmp__ = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
-        _mm_shuffle_pd_11(low1__, low2__), \
-        1); \
-    out2 = _mm256_insertf128_pd( \
+        _mm_shuffle_pd_11(low1__, low2__)); \
+    out2 = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
-        _mm_shuffle_pd_11(high1__, high2__), \
-        1); \
+        _mm_shuffle_pd_11(high1__, high2__)); \
     out1 = tmp__; \
 }
 
@@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
     __m128d low2__ = _mm256_castpd256_pd128(in2); \
     __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
     __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
-    __m256d tmp__ = _mm256_insertf128_pd( \
+    __m256d tmp__ = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
-        _mm_shuffle_pd_00(low2__, high2__), \
-        1); \
-    out2 = _mm256_insertf128_pd( \
+        _mm_shuffle_pd_00(low2__, high2__)); \
+    out2 = _mm256_insertf128_pd_1( \
         _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
-        _mm_shuffle_pd_11(low2__, high2__), \
-        1); \
+        _mm_shuffle_pd_11(low2__, high2__)); \
     out1 = tmp__; \
 }
 
@@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
    return [ b[0], b[1], a[2], a[3] ]
 */
 #  define VSWAPHL(a,b) \
-    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
 
 /* reverse/flip all floats */
 #  define VREV_S(a)   _mm256_reverse(a)
 
 /* reverse/flip complex floats */
-#  define VREV_C(a)   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+#  define VREV_C(a)   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
 
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
index c8cd74e..5cce17e 100644
--- a/simd/pf_neon_double_from_avx.h
+++ b/simd/pf_neon_double_from_avx.h
@@ -25,9 +25,7 @@
 #if defined(__GNUC__) || defined(__clang__)
 
 #pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
 #define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
 
 #else
 
@@ -37,99 +35,88 @@
 #endif
 
 #define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
 
 #endif
 
 typedef struct {
-	float32x4_t vect_f32[2];
+    float32x4_t vect_f32[2];
 } __m256;
 
 typedef struct {
-	float64x2_t vect_f64[2];
+    float64x2_t vect_f64[2];
 } __m256d;
 
 typedef float64x2_t __m128d;
 
 FORCE_INLINE __m256d _mm256_setzero_pd(void)
 {
-	__m256d ret;
-	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
-	return ret;
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+    return ret;
 }
 
 FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
 }
 
 FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
}
 
 FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
 {
-	__m256d res_m256d;
-	res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
-	res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
-	return res_m256d;
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
 }
 
 FORCE_INLINE __m256d _mm256_set1_pd(double a)
 {
-	__m256d ret;
-	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
-	return ret;
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+    return ret;
 }
 
 FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
 {
-	__m256d res;
-	res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
-	res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
-	return res;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
 }
 
 FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
 {
-	__m256d res;
-	res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
-	res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
-	return res;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
 }
 
 FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
 {
-	return a.vect_f64[0];
+    return a.vect_f64[0];
 }
 
 FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
 {
-	assert(imm8 >= 0 && imm8 <= 1);
-	return a.vect_f64[imm8];
-}
-FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
-{
-	assert(imm8 == 0 || imm8 == 1);
-	__m256d res;
-	uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
-	res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
-	res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
-	return res;
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_f64[imm8];
 }
+
 FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
 {
-	__m256d res;
-	res.vect_f64[0] = a;
-	return res;
+    __m256d res;
+    res.vect_f64[0] = a;
+    return res;
 }
 
 #endif /* PF_AVX_DBL_H */
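
Context for the change above: the removed _mm256_insertf128_pd emulated Intel's variable-index insert with a runtime vceqq_s64/vbslq_f64 lane select, yet every call site in pf_neon_double.h passed the constant index 1. The new _mm256_insertf128_pd_1 hard-codes that case as two plain assignments, so the mask machinery disappears entirely. The following is a minimal plain-C sketch of the same idea, buildable without NEON; the pd128/pd256 stand-in types and the insert_generic/insert_upper helpers are illustrative names, not part of pffft.

/* sketch.c - why specializing the insert index simplifies the code */
#include <assert.h>
#include <stdio.h>

typedef struct { double lane[2]; } pd128;   /* stands in for __m128d */
typedef struct { pd128 half[2]; } pd256;    /* stands in for __m256d */

/* generic insert: lane index is chosen at runtime, as the removed
   _mm256_insertf128_pd emulation did with a vbslq_f64 mask select */
static pd256 insert_generic(pd256 a, pd128 b, int imm8)
{
    assert(imm8 == 0 || imm8 == 1);
    pd256 res = a;
    res.half[imm8] = b;                      /* runtime lane selection */
    return res;
}

/* specialized insert into the upper half, mirroring the shape of the
   new _mm256_insertf128_pd_1: no index, no mask, two copies */
static pd256 insert_upper(pd256 a, pd128 b)
{
    pd256 res;
    res.half[0] = a.half[0];                 /* keep low half of a */
    res.half[1] = b;                         /* overwrite high half */
    return res;
}

int main(void)
{
    pd256 a = { { { { 1.0, 2.0 } }, { { 3.0, 4.0 } } } };
    pd128 b = { { 9.0, 8.0 } };

    pd256 r_generic = insert_generic(a, b, 1);
    pd256 r_fixed   = insert_upper(a, b);

    /* both forms agree for index 1, the only index pffft used */
    assert(r_generic.half[1].lane[0] == r_fixed.half[1].lane[0]);
    assert(r_generic.half[0].lane[1] == r_fixed.half[0].lane[1]);

    printf("%g %g %g %g\n",
           r_fixed.half[0].lane[0], r_fixed.half[0].lane[1],
           r_fixed.half[1].lane[0], r_fixed.half[1].lane[1]);  /* 1 2 9 8 */
    return 0;
}

Because the index is fixed, the diff can also drop the third argument at every call site (VSWAPHL, VREV_C, and the interleave macros), which is where most of the 62 deleted lines in pf_neon_double.h come from.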