diff options
Diffstat (limited to 'simd/pf_neon_double.h')
-rw-r--r-- | simd/pf_neon_double.h | 31 |
1 files changed, 17 insertions, 14 deletions
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h index 1c8b852..140f465 100644 --- a/simd/pf_neon_double.h +++ b/simd/pf_neon_double.h @@ -65,6 +65,13 @@ typedef union v4sf_union { # define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) # define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b; + return res; +} FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) { @@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ - _mm_shuffle_pd_11(low1__, low2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_11(low1__, low2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ - _mm_shuffle_pd_11(high1__, high2__), \ - 1); \ + _mm_shuffle_pd_11(high1__, high2__)); \ out1 = tmp__; \ } @@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ - _mm_shuffle_pd_00(low2__, high2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_00(low2__, high2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ - _mm_shuffle_pd_11(low2__, high2__), \ - 1); \ + _mm_shuffle_pd_11(low2__, high2__)); \ out1 = tmp__; \ } @@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] return [ b[0], b[1], a[2], a[3] ] */ # define VSWAPHL(a,b) \ - _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) /* reverse/flip all floats */ # define VREV_S(a) _mm256_reverse(a) /* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) |