author     dario mambro <dario.mambro@gmail.com>  2020-12-19 01:52:10 +0100
committer  dario mambro <dario.mambro@gmail.com>  2020-12-24 14:31:39 +0100
commit     5fb07d7117dab7e1511895e8106224876abadd00 (patch)
tree       acca7886ff3c63c1afbca30cd0e02144910f2c9d
parent     929060c0c3e6873bd7ee085af92695f76934adf9 (diff)
download   pffft-5fb07d7117dab7e1511895e8106224876abadd00.tar.gz
added support for doubles on neon
(cherry picked from commit d43dfeca5679624cb04ef282d6807910fd218871)
-rw-r--r--  simd/pf_double.h                  1
-rw-r--r--  simd/pf_neon_double.h           200
-rw-r--r--  simd/pf_neon_double_from_avx.h  136
3 files changed, 337 insertions, 0 deletions
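The commit wires a double-precision NEON path into pffft by emulating the handful of AVX intrinsics the existing double code relies on. For orientation: the new path is only compiled in when PFFFT_SIMD_DISABLE is not defined, PFFFT_ENABLE_NEON is defined, and the target is 64-bit ARM; that guard appears in simd/pf_neon_double.h below. A minimal, hypothetical compile-time probe of the same condition (not part of the commit):

/* probe_neon_double.c -- hypothetical probe, mirroring the guard in simd/pf_neon_double.h */
#include <stdio.h>

int main(void)
{
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && \
    (defined(__aarch64__) || defined(__arm64__))
    printf("NEON double path enabled: 4 doubles per SIMD vector\n");
#else
    printf("NEON double path not enabled\n");
#endif
    return 0;
}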
diff --git a/simd/pf_double.h b/simd/pf_double.h
index 2052bbd..c6c73ab 100644
--- a/simd/pf_double.h
+++ b/simd/pf_double.h
@@ -60,6 +60,7 @@
 typedef double vsfscalar;
 
 #include "pf_avx_double.h"
+#include "pf_neon_double.h"
 
 #ifndef SIMD_SZ
 # if !defined(PFFFT_SIMD_DISABLE)
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
new file mode 100644
index 0000000..1c8b852
--- /dev/null
+++ b/simd/pf_neon_double.h
@@ -0,0 +1,200 @@
+/*
+ Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
+
+ Redistribution and use of the Software in source and binary forms,
+ with or without modification, is permitted provided that the
+ following conditions are met:
+
+ - Neither the names of NCAR's Computational and Information Systems
+ Laboratory, the University Corporation for Atmospheric Research,
+ nor the names of its sponsors or contributors may be used to
+ endorse or promote products derived from this Software without
+ specific prior written permission.
+
+ - Redistributions of source code must retain the above copyright
+ notices, this list of conditions, and the disclaimer below.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the disclaimer below in the
+ documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+ SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+/*
+ NEON 64bit support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
+
+#pragma message __FILE__ ": NEON (from AVX) macros are defined"
+
+#include "pf_neon_double_from_avx.h"
+typedef __m256d v4sf;
+
+/* 4 doubles per SIMD vector */
+# define SIMD_SZ 4
+
+typedef union v4sf_union {
+ v4sf v;
+ double f[SIMD_SZ];
+} v4sf_union;
+
+# define VARCH "NEON"
+# define VREQUIRES_ALIGN 1
+# define VZERO() _mm256_setzero_pd()
+# define VMUL(a,b) _mm256_mul_pd(a,b)
+# define VADD(a,b) _mm256_add_pd(a,b)
+# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
+# define VSUB(a,b) _mm256_sub_pd(a,b)
+# define LD_PS1(p) _mm256_set1_pd(p)
+# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
+# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
+
+
+FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
+{
+ float64x1_t al = vget_low_f64(a);
+ float64x1_t bl = vget_low_f64(b);
+ return vcombine_f64(al, bl);
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
+{
+ float64x1_t ah = vget_high_f64(a);
+ float64x1_t bh = vget_high_f64(b);
+ return vcombine_f64(ah, bh);
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
+{
+ __m256d res;
+ res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
+ res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
+ return res;
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
+{
+ __m256d res;
+ res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
+ res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
+ return res;
+}
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
+ __m256d res;
+ res.vect_f64[0] = a.vect_f64[0];
+ res.vect_f64[1] = b.vect_f64[0];
+ return res;
+}
+
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
+{
+ __m256d res;
+ res.vect_f64[0] = a.vect_f64[1];
+ res.vect_f64[1] = b.vect_f64[1];
+ return res;
+}
+
+FORCE_INLINE __m256d _mm256_reverse(__m256d x)
+{
+ __m256d res;
+ float64x2_t low = x.vect_f64[0];
+ float64x2_t high = x.vect_f64[1];
+ float64x1_t a = vget_low_f64(low);
+ float64x1_t b = vget_high_f64(low);
+ float64x1_t c = vget_low_f64(high);
+ float64x1_t d = vget_high_f64(high);
+ res.vect_f64[0] = vcombine_f64(d, c);
+ res.vect_f64[1] = vcombine_f64(b, a);
+ return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+# define INTERLEAVE2(in1, in2, out1, out2) { \
+ __m128d low1__ = _mm256_castpd256_pd128(in1); \
+ __m128d low2__ = _mm256_castpd256_pd128(in2); \
+ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
+ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
+ __m256d tmp__ = _mm256_insertf128_pd( \
+ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
+ _mm_shuffle_pd_11(low1__, low2__), \
+ 1); \
+ out2 = _mm256_insertf128_pd( \
+ _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
+ _mm_shuffle_pd_11(high1__, high2__), \
+ 1); \
+ out1 = tmp__; \
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+# define UNINTERLEAVE2(in1, in2, out1, out2) { \
+ __m128d low1__ = _mm256_castpd256_pd128(in1); \
+ __m128d low2__ = _mm256_castpd256_pd128(in2); \
+ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
+ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
+ __m256d tmp__ = _mm256_insertf128_pd( \
+ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
+ _mm_shuffle_pd_00(low2__, high2__), \
+ 1); \
+ out2 = _mm256_insertf128_pd( \
+ _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
+ _mm_shuffle_pd_11(low2__, high2__), \
+ 1); \
+ out1 = tmp__; \
+}
+
+# define VTRANSPOSE4(row0, row1, row2, row3) { \
+ __m256d tmp3, tmp2, tmp1, tmp0; \
+ \
+ tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
+ tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
+ tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
+ tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
+ \
+ (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
+ (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
+ (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
+ (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
+ }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+# define VSWAPHL(a,b) \
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+
+/* reverse/flip all doubles */
+# define VREV_S(a) _mm256_reverse(a)
+
+/* reverse/flip complex doubles */
+# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_NEON_DBL_H */
+
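Because pf_neon_double.h only depends on the emulation header it includes, the macro layer above can be smoke-tested on its own. A minimal sketch, assuming an aarch64 toolchain and the pffft source tree on the include path (file name and compile line are illustrative, not part of the commit):

/* test_neon_double.c -- hypothetical smoke test of the NEON double macros.
   Build on aarch64 only, e.g.: gcc -O2 -I. test_neon_double.c */
#define PFFFT_ENABLE_NEON
#include <stdio.h>
#include "simd/pf_neon_double.h"

int main(void)
{
    const double vals[4] = {1.0, 2.0, 3.0, 4.0};
    v4sf_union a, b, sum;

    a.v   = LD_PS1(1.5);            /* broadcast 1.5 into all four lanes */
    b.v   = VLOAD_UNALIGNED(vals);  /* load {1, 2, 3, 4}                 */
    sum.v = VADD(a.v, b.v);         /* element-wise add                  */

    for (int i = 0; i < SIMD_SZ; ++i)   /* expect: 2.5 3.5 4.5 5.5 */
        printf("%g ", sum.f[i]);
    printf("\n");
    return 0;
}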
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
new file mode 100644
index 0000000..c8cd74e
--- /dev/null
+++ b/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+// see https://github.com/kunpengcompute/AvxToNeon
+
+#ifndef PF_NEON_DBL_FROM_AVX_H
+#define PF_NEON_DBL_FROM_AVX_H
+#include <arm_neon.h>
+#include <assert.h> /* assert() is used below in _mm256_extractf128_pd / _mm256_insertf128_pd */
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+
+#else
+
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+
+#endif
+
+typedef struct {
+ float32x4_t vect_f32[2];
+} __m256;
+
+typedef struct {
+ float64x2_t vect_f64[2];
+} __m256d;
+
+typedef float64x2_t __m128d;
+
+FORCE_INLINE __m256d _mm256_setzero_pd(void)
+{
+ __m256d ret;
+ ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+ return ret;
+}
+
+FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
+{
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
+{
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
+{
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_set1_pd(double a)
+{
+ __m256d ret;
+ ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+ return ret;
+}
+
+FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
+{
+ __m256d res;
+ res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+ res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+ return res;
+}
+
+/* NEON vld1q_f64 has no alignment requirement, so the unaligned load is identical to _mm256_load_pd */
+FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
+{
+ __m256d res;
+ res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+ res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+ return res;
+}
+
+FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
+{
+ return a.vect_f64[0];
+}
+
+FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
+{
+ assert(imm8 >= 0 && imm8 <= 1);
+ return a.vect_f64[imm8];
+}
+FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
+{
+ assert(imm8 == 0 || imm8 == 1);
+ __m256d res;
+ uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
+ res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
+ res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
+ return res;
+}
+FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
+{
+ __m256d res;
+ res.vect_f64[0] = a;
+ return res;
+}
+
+#endif /* PF_NEON_DBL_FROM_AVX_H */
+
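As a companion sketch, here is a small hypothetical check of what the emulated lane intrinsics _mm256_extractf128_pd and _mm256_insertf128_pd above compute, under the same aarch64 and include-path assumptions:

/* lane_check.c -- hypothetical check of the emulated 128-bit lane helpers (aarch64 only) */
#include <stdio.h>
#include "simd/pf_neon_double_from_avx.h"

int main(void)
{
    const double vals[4] = {10.0, 11.0, 12.0, 13.0};
    __m256d a = _mm256_loadu_pd(vals);

    /* high 128-bit lane of a is {12, 13}; insert it as the low lane of a zero vector */
    __m128d hi = _mm256_extractf128_pd(a, 1);
    __m256d r  = _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 0);

    double out[4];
    vst1q_f64(out,     r.vect_f64[0]);   /* expect 12 13 */
    vst1q_f64(out + 2, r.vect_f64[1]);   /* expect  0  0 */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}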