1 files changed, 17 insertions, 14 deletions
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
index 1c8b852..140f465 100644
--- a/simd/pf_neon_double.h
+++ b/simd/pf_neon_double.h
@@ -65,6 +65,13 @@ typedef union v4sf_union {
 #  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
 
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)  /* stands in for _mm256_insertf128_pd(a, b, 1): the index 1 is baked into the name */
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];  /* element 0 is carried over from a unchanged */
+    res.vect_f64[1] = b;              /* element 1 is replaced by b; a.vect_f64[1] is never read */
+    return res;
+}
 
 FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
 {
@@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ]
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
-	__m256d tmp__ = _mm256_insertf128_pd(								\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),		\
-		_mm_shuffle_pd_11(low1__, low2__),								\
-		1);																\
-	out2 = _mm256_insertf128_pd(										\
+		_mm_shuffle_pd_11(low1__, low2__));								\
+	out2 = _mm256_insertf128_pd_1(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),	\
-		_mm_shuffle_pd_11(high1__, high2__),							\
-		1);																\
+		_mm_shuffle_pd_11(high1__, high2__));							\
 	out1 = tmp__;														\
 }
 
@@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
-	__m256d tmp__ = _mm256_insertf128_pd(								\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),		\
-		_mm_shuffle_pd_00(low2__, high2__),								\
-		1);																\
-	out2 = _mm256_insertf128_pd(										\
+		_mm_shuffle_pd_00(low2__, high2__));							\
+	out2 = _mm256_insertf128_pd_1(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),		\
-		_mm_shuffle_pd_11(low2__, high2__),								\
-		1);																\
+		_mm_shuffle_pd_11(low2__, high2__));							\
 	out1 = tmp__;														\
 }
 
@@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
 return [ b[0], b[1], a[2], a[3] ]
 */
 #  define VSWAPHL(a,b)	\
-   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) /* 1st argument supplies b[0], b[1]; 2nd argument supplies a[2], a[3] */
 
 /* reverse/flip all floats */
 #  define VREV_S(a)   _mm256_reverse(a)
 
 /* reverse/flip complex floats */
-#  define VREV_C(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+#  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) /* yields [ a[2], a[3], a[0], a[1] ] */
 
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)