From 5fb07d7117dab7e1511895e8106224876abadd00 Mon Sep 17 00:00:00 2001
From: dario mambro <dario.mambro@gmail.com>
Date: Sat, 19 Dec 2020 01:52:10 +0100
Subject: added support for doubles on neon

(cherry picked from commit d43dfeca5679624cb04ef282d6807910fd218871)
---
 simd/pf_double.h               |   1 +
 simd/pf_neon_double.h          | 200 +++++++++++++++++++++++++++++++++++++++++
 simd/pf_neon_double_from_avx.h | 136 ++++++++++++++++++++++
 3 files changed, 337 insertions(+)
 create mode 100644 simd/pf_neon_double.h
 create mode 100644 simd/pf_neon_double_from_avx.h

diff --git a/simd/pf_double.h b/simd/pf_double.h
index 2052bbd..c6c73ab 100644
--- a/simd/pf_double.h
+++ b/simd/pf_double.h
@@ -60,6 +60,7 @@ typedef double vsfscalar;
 
 #include "pf_avx_double.h"
 
+#include "pf_neon_double.h"
 
 #ifndef SIMD_SZ
 # if !defined(PFFFT_SIMD_DISABLE)
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
new file mode 100644
index 0000000..1c8b852
--- /dev/null
+++ b/simd/pf_neon_double.h
@@ -0,0 +1,200 @@
+/*
+  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+/*
+  NEON 64bit support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
+
+#pragma message __FILE__ ": NEON (from AVX) macros are defined"
+
+#include "pf_neon_double_from_avx.h"
+typedef __m256d v4sf;
+
+/* 4 doubles by simd vector */
+# define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+# define VARCH "NEON"
+# define VREQUIRES_ALIGN 1
+# define VZERO() _mm256_setzero_pd()
+# define VMUL(a,b) _mm256_mul_pd(a,b)
+# define VADD(a,b) _mm256_add_pd(a,b)
+# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
+# define VSUB(a,b) _mm256_sub_pd(a,b)
+# define LD_PS1(p) _mm256_set1_pd(p)
+# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
+# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
+
+
+FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
+{
+  float64x1_t al = vget_low_f64(a);
+  float64x1_t bl = vget_low_f64(b);
+  return vcombine_f64(al, bl);
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
+{
+  float64x1_t ah = vget_high_f64(a);
+  float64x1_t bh = vget_high_f64(b);
+  return vcombine_f64(ah, bh);
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
+{
+  __m256d res;
+  res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
+  res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
+  return res;
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
+{
+  __m256d res;
+  res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
+  res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
+  return res;
+}
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
+  __m256d res;
+  res.vect_f64[0] = a.vect_f64[0];
+  res.vect_f64[1] = b.vect_f64[0];
+  return res;
+}
+
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
+{
+  __m256d res;
+  res.vect_f64[0] = a.vect_f64[1];
+  res.vect_f64[1] = b.vect_f64[1];
+  return res;
+}
+
+FORCE_INLINE __m256d _mm256_reverse(__m256d x)
+{
+  __m256d res;
+  float64x2_t low = x.vect_f64[0];
+  float64x2_t high = x.vect_f64[1];
+  float64x1_t a = vget_low_f64(low);
+  float64x1_t b = vget_high_f64(low);
+  float64x1_t c = vget_low_f64(high);
+  float64x1_t d = vget_high_f64(high);
+  res.vect_f64[0] = vcombine_f64(d, c);
+  res.vect_f64[1] = vcombine_f64(b, a);
+  return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+# define INTERLEAVE2(in1, in2, out1, out2) {                            \
+  __m128d low1__ = _mm256_castpd256_pd128(in1);                         \
+  __m128d low2__ = _mm256_castpd256_pd128(in2);                         \
+  __m128d high1__ = _mm256_extractf128_pd(in1, 1);                      \
+  __m128d high2__ = _mm256_extractf128_pd(in2, 1);                      \
+  __m256d tmp__ = _mm256_insertf128_pd(                                 \
+    _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),          \
+    _mm_shuffle_pd_11(low1__, low2__),                                  \
+    1);                                                                 \
+  out2 = _mm256_insertf128_pd(                                          \
+    _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),        \
+    _mm_shuffle_pd_11(high1__, high2__),                                \
+    1);                                                                 \
+  out1 = tmp__;                                                         \
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+# define UNINTERLEAVE2(in1, in2, out1, out2) {                          \
+  __m128d low1__ = _mm256_castpd256_pd128(in1);                         \
+  __m128d low2__ = _mm256_castpd256_pd128(in2);                         \
+  __m128d high1__ = _mm256_extractf128_pd(in1, 1);                      \
+  __m128d high2__ = _mm256_extractf128_pd(in2, 1);                      \
+  __m256d tmp__ = _mm256_insertf128_pd(                                 \
+    _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),         \
+    _mm_shuffle_pd_00(low2__, high2__),                                 \
+    1);                                                                 \
+  out2 = _mm256_insertf128_pd(                                          \
+    _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),         \
+    _mm_shuffle_pd_11(low2__, high2__),                                 \
+    1);                                                                 \
+  out1 = tmp__;                                                         \
+}
+
+# define VTRANSPOSE4(row0, row1, row2, row3) {                          \
+    __m256d tmp3, tmp2, tmp1, tmp0;                                     \
+                                                                        \
+    tmp0 = _mm256_shuffle_pd_00((row0),(row1));                         \
+    tmp2 = _mm256_shuffle_pd_11((row0),(row1));                         \
+    tmp1 = _mm256_shuffle_pd_00((row2),(row3));                         \
+    tmp3 = _mm256_shuffle_pd_11((row2),(row3));                         \
+                                                                        \
+    (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1);                   \
+    (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3);                   \
+    (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1);                   \
+    (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3);                   \
+  }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+# define VSWAPHL(a,b) \
+  _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+
+/* reverse/flip all floats */
+# define VREV_S(a) _mm256_reverse(a)
+
+/* reverse/flip complex floats */
+# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_NEON_DBL_H */
+
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
new file mode 100644
index 0000000..c8cd74e
--- /dev/null
+++ b/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ *     http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+//see https://github.com/kunpengcompute/AvxToNeon
+
+#ifndef PF_NEON_DBL_FROM_AVX_H
+#define PF_NEON_DBL_FROM_AVX_H
+#include <arm_neon.h>
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+
+#else
+
+#error "Macro name collisions may happen with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+
+#endif
+
+typedef struct {
+    float32x4_t vect_f32[2];
+} __m256;
+
+typedef struct {
+    float64x2_t vect_f64[2];
+} __m256d;
+
+typedef float64x2_t __m128d;
+
+FORCE_INLINE __m256d _mm256_setzero_pd(void)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_set1_pd(double a)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+
+FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
+{
+    return a.vect_f64[0];
+}
+
+FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_f64[imm8];
+}
+FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
+{
+    assert(imm8 == 0 || imm8 == 1);
+    __m256d res;
+    uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
+    res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
+    res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
+    return res;
+}
+FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
+{
+    __m256d res;
+    res.vect_f64[0] = a;
+    return res;
+}
+
+#endif /* PF_NEON_DBL_FROM_AVX_H */
+
-- 
cgit v1.2.3
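
Editorial note (not part of the patch): the INTERLEAVE2, UNINTERLEAVE2 and VTRANSPOSE4 macros in pf_neon_double.h are documented only through their pseudo-code comments. The short, self-contained C sketch below restates those lane semantics in scalar form so the NEON path can be sanity-checked on any machine; the helper names interleave2_ref, uninterleave2_ref and vtranspose4_ref are illustrative only and do not exist in pffft.

/* Scalar reference model for the lane semantics described in pf_neon_double.h */
#include <stdio.h>

/* out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ] */
static void interleave2_ref(const double in1[4], const double in2[4],
                            double out1[4], double out2[4])
{
    out1[0] = in1[0]; out1[1] = in2[0]; out1[2] = in1[1]; out1[3] = in2[1];
    out2[0] = in1[2]; out2[1] = in2[2]; out2[2] = in1[3]; out2[3] = in2[3];
}

/* out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
static void uninterleave2_ref(const double in1[4], const double in2[4],
                              double out1[4], double out2[4])
{
    out1[0] = in1[0]; out1[1] = in1[2]; out1[2] = in2[0]; out1[3] = in2[2];
    out2[0] = in1[1]; out2[1] = in1[3]; out2[2] = in2[1]; out2[3] = in2[3];
}

/* 4x4 in-place transpose: the behaviour VTRANSPOSE4 builds out of
   _mm256_shuffle_pd_00/11 and _mm256_permute2f128_pd_0x20/0x31 */
static void vtranspose4_ref(double m[4][4])
{
    for (int i = 0; i < 4; ++i)
        for (int j = i + 1; j < 4; ++j) {
            double t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}

int main(void)
{
    double a[4] = {0, 1, 2, 3}, b[4] = {10, 11, 12, 13}, o1[4], o2[4];

    interleave2_ref(a, b, o1, o2);
    printf("INTERLEAVE2:   %g %g %g %g | %g %g %g %g\n",
           o1[0], o1[1], o1[2], o1[3], o2[0], o2[1], o2[2], o2[3]);
    /* expected: 0 10 1 11 | 2 12 3 13 */

    uninterleave2_ref(a, b, o1, o2);
    printf("UNINTERLEAVE2: %g %g %g %g | %g %g %g %g\n",
           o1[0], o1[1], o1[2], o1[3], o2[0], o2[1], o2[2], o2[3]);
    /* expected: 0 2 10 12 | 1 3 11 13 */

    return 0;
}

Comparing these printed lanes against a build of the real macros (compiled with PFFFT_ENABLE_NEON on an aarch64 target, reading the results back through a v4sf_union) is one straightforward way to validate the emulation layer.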