From 5fb07d7117dab7e1511895e8106224876abadd00 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 01:52:10 +0100 Subject: added support for doubles on neon (cherry picked from commit d43dfeca5679624cb04ef282d6807910fd218871) --- simd/pf_double.h | 1 + simd/pf_neon_double.h | 200 +++++++++++++++++++++++++++++++++++++++++ simd/pf_neon_double_from_avx.h | 136 ++++++++++++++++++++++++++++ 3 files changed, 337 insertions(+) create mode 100644 simd/pf_neon_double.h create mode 100644 simd/pf_neon_double_from_avx.h diff --git a/simd/pf_double.h b/simd/pf_double.h index 2052bbd..c6c73ab 100644 --- a/simd/pf_double.h +++ b/simd/pf_double.h @@ -60,6 +60,7 @@ typedef double vsfscalar; #include "pf_avx_double.h" +#include "pf_neon_double.h" #ifndef SIMD_SZ # if !defined(PFFFT_SIMD_DISABLE) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h new file mode 100644 index 0000000..1c8b852 --- /dev/null +++ b/simd/pf_neon_double.h @@ -0,0 +1,200 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_NEON_DBL_H +#define PF_NEON_DBL_H + +/* + NEON 64bit support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) + +#pragma message __FILE__ ": NEON (from AVX) macros are defined" + +#include "pf_neon_double_from_avx.h" +typedef __m256d v4sf; + +/* 4 doubles by simd vector */ +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "NEON" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + + +FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) +{ + float64x1_t al = vget_low_f64(a); + float64x1_t bl = vget_low_f64(b); + return vcombine_f64(al, bl); +} + +FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b) +{ + float64x1_t ah = vget_high_f64(a); + float64x1_t bh = vget_high_f64(b); + return vcombine_f64(ah, bh); +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b.vect_f64[0]; + return res; +} + + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[1]; + res.vect_f64[1] = b.vect_f64[1]; + return res; +} + +FORCE_INLINE __m256d _mm256_reverse(__m256d x) +{ + __m256d res; + float64x2_t low = x.vect_f64[0]; + float64x2_t high = x.vect_f64[1]; + float64x1_t a = vget_low_f64(low); + float64x1_t b = vget_high_f64(low); + float64x1_t c = vget_low_f64(high); + float64x1_t d = vget_high_f64(high); + res.vect_f64[0] = vcombine_f64(d, c); + res.vect_f64[1] = vcombine_f64(b, a); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ + _mm_shuffle_pd_11(low1__, low2__), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ + _mm_shuffle_pd_11(high1__, high2__), \ + 1); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 
1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ + _mm_shuffle_pd_00(low2__, high2__), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ + _mm_shuffle_pd_11(low2__, high2__), \ + 1); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif + +#endif /* PF_AVX_DBL_H */ + diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h new file mode 100644 index 0000000..c8cd74e --- /dev/null +++ b/simd/pf_neon_double_from_avx.h @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ + */ + +//see https://github.com/kunpengcompute/AvxToNeon + +#ifndef PF_NEON_DBL_FROM_AVX_H +#define PF_NEON_DBL_FROM_AVX_H +#include + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) + +#else + +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif + +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +#endif + +typedef struct { + float32x4_t vect_f32[2]; +} __m256; + +typedef struct { + float64x2_t vect_f64[2]; +} __m256d; + +typedef float64x2_t __m128d; + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.vect_f64[0]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; +} +FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m256d res; + uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); + res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]); + res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b); + return res; +} +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.vect_f64[0] = a; + return res; +} + +#endif /* PF_AVX_DBL_H */ + -- cgit v1.2.3 From 8dc269e569cfe6c796bf37a30c5cc4798be8a750 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 15:25:21 +0100 Subject: simplified some neon code, changed some tabs into spaces (cherry picked from commit c92f08c8226e4c069436751b09554ada362ae7c8) --- simd/pf_neon_double.h | 31 +++++++++------- simd/pf_neon_double_from_avx.h | 83 ++++++++++++++++++------------------------ 2 files changed, 52 insertions(+), 62 deletions(-) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h index 1c8b852..140f465 100644 --- a/simd/pf_neon_double.h +++ b/simd/pf_neon_double.h @@ -65,6 +65,13 @@ typedef union 
v4sf_union { # define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) # define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b; + return res; +} FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) { @@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ - _mm_shuffle_pd_11(low1__, low2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_11(low1__, low2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ - _mm_shuffle_pd_11(high1__, high2__), \ - 1); \ + _mm_shuffle_pd_11(high1__, high2__)); \ out1 = tmp__; \ } @@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] __m128d low2__ = _mm256_castpd256_pd128(in2); \ __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ - _mm_shuffle_pd_00(low2__, high2__), \ - 1); \ - out2 = _mm256_insertf128_pd( \ + _mm_shuffle_pd_00(low2__, high2__)); \ + out2 = _mm256_insertf128_pd_1( \ _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ - _mm_shuffle_pd_11(low2__, high2__), \ - 1); \ + _mm_shuffle_pd_11(low2__, high2__)); \ out1 = tmp__; \ } @@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ] return [ b[0], b[1], a[2], a[3] ] */ # define VSWAPHL(a,b) \ - _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) /* reverse/flip all floats */ # define VREV_S(a) _mm256_reverse(a) /* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h index c8cd74e..5cce17e 100644 --- a/simd/pf_neon_double_from_avx.h +++ b/simd/pf_neon_double_from_avx.h @@ -25,9 +25,7 @@ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") -#pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) #else @@ -37,99 +35,88 @@ #endif #define FORCE_INLINE static inline -#ifndef ALIGN_STRUCT -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif #endif typedef struct { - float32x4_t vect_f32[2]; + float32x4_t vect_f32[2]; } __m256; typedef struct { - float64x2_t vect_f64[2]; + float64x2_t vect_f64[2]; } __m256d; typedef float64x2_t __m128d; FORCE_INLINE __m256d _mm256_setzero_pd(void) { - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); - return ret; + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; } FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = 
vmulq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) { - __m256d res_m256d; - res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; } FORCE_INLINE __m256d _mm256_set1_pd(double a) { - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); - return ret; + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; } FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) { - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; } FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) { - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; } FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) { - return a.vect_f64[0]; + return a.vect_f64[0]; } FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) { - assert(imm8 >= 0 && imm8 <= 1); - return a.vect_f64[imm8]; -} -FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) -{ - assert(imm8 == 0 || imm8 == 1); - __m256d res; - uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); - res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]); - res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b); - return res; + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; } + FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) { - __m256d res; - res.vect_f64[0] = a; - return res; + __m256d res; + res.vect_f64[0] = a; + return res; } #endif /* PF_AVX_DBL_H */ -- cgit v1.2.3 From 6d78ad89cc4f3802cdedd0ec5e2bcf9893ba20cc Mon Sep 17 00:00:00 2001 From: dario mambro Date: Sat, 19 Dec 2020 22:23:29 +0100 Subject: added support for doubles with sse2 (cherry picked from commit a52039328c08555b1143dbc9100b8f6df44f2b90) --- CMakeLists.txt | 10 +- simd/pf_avx_double.h | 2 +- simd/pf_double.h | 1 + simd/pf_sse2_double.h | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 simd/pf_sse2_double.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 47cecb5..11dad3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ option(USE_TYPE_DOUBLE "activate 
'double' precision float?" ON) # architecture/optimization options option(USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON) +option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF) option(USE_SIMD_NEON "force using NEON on ARM? (requires USE_SIMD)" OFF) option(USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON) @@ -133,10 +134,17 @@ if (USE_SIMD AND USE_SIMD_NEON) endif() if (USE_SIMD AND USE_TYPE_DOUBLE) if(WIN32) - set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX") + if(DISABLE_SIMD_AVX) + set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:SSE2") + else() + set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX") + endif() else() set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "-march=native") endif() + if(DISABLE_SIMD_AVX) + target_compile_definitions(PFFFT PRIVATE PFFFT_AVX_DISABLE=1) + endif() endif() target_link_libraries( PFFFT ${MATHLIB} ) set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES diff --git a/simd/pf_avx_double.h b/simd/pf_avx_double.h index fe0efa8..251f0b9 100644 --- a/simd/pf_avx_double.h +++ b/simd/pf_avx_double.h @@ -46,7 +46,7 @@ /* AVX support macros */ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__) +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && !defined(PFFFT_AVX_DISABLE) && defined(__AVX__) #pragma message( __FILE__ ": AVX macros are defined" ) #include diff --git a/simd/pf_double.h b/simd/pf_double.h index c6c73ab..1025827 100644 --- a/simd/pf_double.h +++ b/simd/pf_double.h @@ -60,6 +60,7 @@ typedef double vsfscalar; #include "pf_avx_double.h" +#include "pf_sse2_double.h" #include "pf_neon_double.h" #ifndef SIMD_SZ diff --git a/simd/pf_sse2_double.h b/simd/pf_sse2_double.h new file mode 100644 index 0000000..1c1739d --- /dev/null +++ b/simd/pf_sse2_double.h @@ -0,0 +1,272 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_NEON_DBL_H +#define PF_NEON_DBL_H + +/* + SSE2 64bit support macros +*/ +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (( __SSE2__ ) || defined ( __x86_64__ )) +#pragma message __FILE__ ": SSE2 double macros are defined" + +#include + +typedef struct { + __m128d d128[2]; +} __m256d; + +typedef __m256d v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#define FORCE_INLINE static inline __attribute__((always_inline)) + +#elif defined (_MSC_VER) +#define FORCE_INLINE static __forceinline + +#else +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#endif + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); + return ret; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d ret; + ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +{ + __m256d res; + res.d128[0] = _mm_load_pd((const double *)mem_addr); + res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +{ + __m256d res; + res.d128[0] = _mm_loadu_pd((const double *)mem_addr); + res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); + return res; +} + + +# define VARCH "SSE2" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.d128[0]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.d128[imm8]; +} +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = b; + return res; +} +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.d128[0] = a; + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); + return res; +} + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { + __m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = 
b.d128[0]; + return res; +} + + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +{ + __m256d res; + res.d128[0] = a.d128[1]; + res.d128[1] = b.d128[1]; + return res; +} + +FORCE_INLINE __m256d _mm256_reverse(__m256d x) +{ + __m256d res; + res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); + res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + _mm_shuffle_pd(low1__, low2__, 3)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + _mm_shuffle_pd(high1__, high2__, 3)); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + _mm_shuffle_pd(low2__, high2__, 0)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + _mm_shuffle_pd(low2__, high2__, 3)); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif +#endif -- cgit v1.2.3 From 776abf7ae79e9a43972b9d86dfe2cce2962a7897 Mon Sep 17 00:00:00 2001 From: dario mambro Date: Thu, 24 Dec 2020 17:05:10 +0100 Subject: fixes in sse2 and neon implementation for doubles --- simd/pf_neon_double.h | 2 +- simd/pf_sse2_double.h | 149 ++++++++++++++++++++++++++------------------------ 2 files changed, 80 insertions(+), 71 deletions(-) diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h index 140f465..e432abc 100644 --- a/simd/pf_neon_double.h +++ b/simd/pf_neon_double.h @@ -41,7 +41,7 @@ */ #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) -#pragma message __FILE__ ": NEON (from AVX) macros are defined" +#pragma message 
(__FILE__ ": NEON (from AVX) macros are defined" ) #include "pf_neon_double_from_avx.h" typedef __m256d v4sf; diff --git a/simd/pf_sse2_double.h b/simd/pf_sse2_double.h index 1c1739d..6c53e8f 100644 --- a/simd/pf_sse2_double.h +++ b/simd/pf_sse2_double.h @@ -36,19 +36,28 @@ #ifndef PF_NEON_DBL_H #define PF_NEON_DBL_H +//detect sse2 support under MSVC +#if defined ( _M_IX86_FP ) +# if _M_IX86_FP == 2 +# if !defined(__SSE2__) +# define __SSE2__ +# endif +# endif +#endif + /* SSE2 64bit support macros */ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (( __SSE2__ ) || defined ( __x86_64__ )) -#pragma message __FILE__ ": SSE2 double macros are defined" +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ )) +#pragma message (__FILE__ ": SSE2 double macros are defined" ) #include typedef struct { __m128d d128[2]; -} __m256d; +} m256d; -typedef __m256d v4sf; +typedef m256d v4sf; # define SIMD_SZ 4 @@ -74,54 +83,54 @@ typedef union v4sf_union { #define FORCE_INLINE static inline #endif -FORCE_INLINE __m256d _mm256_setzero_pd(void) +FORCE_INLINE m256d mm256_setzero_pd(void) { - __m256d ret; + m256d ret; ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); return ret; } -FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b) { - __m256d ret; + m256d ret; ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); return ret; } -FORCE_INLINE __m256d _mm256_set1_pd(double a) +FORCE_INLINE m256d mm256_set1_pd(double a) { - __m256d ret; + m256d ret; ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); return ret; } -FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +FORCE_INLINE m256d mm256_load_pd (double const * mem_addr) { - __m256d res; + m256d res; res.d128[0] = _mm_load_pd((const double *)mem_addr); res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); return res; } -FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr) { - __m256d res; + m256d res; res.d128[0] = _mm_loadu_pd((const double *)mem_addr); res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); return res; @@ -130,75 +139,75 @@ FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) # define VARCH "SSE2" # define VREQUIRES_ALIGN 1 -# define VZERO() _mm256_setzero_pd() -# define VMUL(a,b) _mm256_mul_pd(a,b) -# define VADD(a,b) _mm256_add_pd(a,b) -# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) -# define VSUB(a,b) _mm256_sub_pd(a,b) -# define LD_PS1(p) _mm256_set1_pd(p) -# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) -# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) +# define VZERO() mm256_setzero_pd() +# define VMUL(a,b) mm256_mul_pd(a,b) +# define VADD(a,b) mm256_add_pd(a,b) +# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c) +# define VSUB(a,b) mm256_sub_pd(a,b) +# define LD_PS1(p) mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) 
mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr) -FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a) { return a.d128[0]; } -FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8) { assert(imm8 >= 0 && imm8 <= 1); return a.d128[imm8]; } -FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b) { - __m256d res; + m256d res; res.d128[0] = a.d128[0]; res.d128[1] = b; return res; } -FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a) { - __m256d res; + m256d res; res.d128[0] = a; return res; } -FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); return res; } -FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); return res; } -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { - __m256d res; +FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) { + m256d res; res.d128[0] = a.d128[0]; res.d128[1] = b.d128[0]; return res; } -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b) { - __m256d res; + m256d res; res.d128[0] = a.d128[1]; res.d128[1] = b.d128[1]; return res; } -FORCE_INLINE __m256d _mm256_reverse(__m256d x) +FORCE_INLINE m256d mm256_reverse(m256d x) { - __m256d res; + m256d res; res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); return res; @@ -209,15 +218,15 @@ out1 = [ in1[0], in2[0], in1[1], in2[1] ] out2 = [ in1[2], in2[2], in1[3], in2[3] ] */ # define INTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ _mm_shuffle_pd(low1__, low2__, 3)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ _mm_shuffle_pd(high1__, high2__, 3)); \ out1 = tmp__; \ } @@ -227,44 +236,44 @@ out1 = [ in1[0], in1[2], in2[0], in2[2] ] out2 = [ in1[1], in1[3], in2[1], in2[3] ] */ # define UNINTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - 
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ _mm_shuffle_pd(low2__, high2__, 0)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ _mm_shuffle_pd(low2__, high2__, 3)); \ out1 = tmp__; \ } # define VTRANSPOSE4(row0, row1, row2, row3) { \ - __m256d tmp3, tmp2, tmp1, tmp0; \ + m256d tmp3, tmp2, tmp1, tmp0; \ \ - tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ - tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ - tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ - tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + tmp0 = mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = mm256_shuffle_pd_11((row2),(row3)); \ \ - (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ - (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ - (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ - (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \ } /*VSWAPHL(a, b) pseudo code: return [ b[0], b[1], a[2], a[3] ] */ # define VSWAPHL(a,b) \ - _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) + mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1)) /* reverse/flip all floats */ -# define VREV_S(a) _mm256_reverse(a) +# define VREV_S(a) mm256_reverse(a) /* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) +# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a)) # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) -- cgit v1.2.3 From 00ba746dacda897031a62a54b2e50c6be9d6320b Mon Sep 17 00:00:00 2001 From: dario mambro Date: Thu, 24 Dec 2020 17:06:03 +0100 Subject: added cmake option to fix building with MSVC using clangCL --- CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11dad3b..7856b75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ option(USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" option(USE_DEBUG_ASAN "use GCC's address sanitizer?" 
OFF) +option(DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF) # C90 requires the gcc extensions for function attributes like always_inline # C99 provides the function attributes: no gcc extensions required @@ -95,8 +96,11 @@ if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" ) ) else() - message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") - set(MATHLIB "m") + if(DISABLE_LINK_WITH_M) + else() + message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") + set(MATHLIB "m") + endif() endif() set( SIMD_FLOAT_HDRS simd/pf_float.h simd/pf_sse1_float.h simd/pf_altivec_float.h simd/pf_neon_float.h simd/pf_scalar_float.h ) -- cgit v1.2.3
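
Reference note (not part of the patches above): the INTERLEAVE2 and UNINTERLEAVE2 macros introduced for both the NEON and SSE2 double paths are documented only by the pseudo-code comments in the diffs. A minimal, standalone scalar sketch of those semantics is given below for reference and sanity checking; it assumes SIMD_SZ == 4 (four doubles per vector), and the helper names are hypothetical, not part of pffft.

#include <assert.h>

/* Scalar reference for the INTERLEAVE2 / UNINTERLEAVE2 pseudo-code in the
   patches above, operating on 4-double "vectors" (SIMD_SZ == 4). */
static void interleave2_ref(const double in1[4], const double in2[4],
                            double out1[4], double out2[4])
{
    /* out1 = [ in1[0], in2[0], in1[1], in2[1] ]
       out2 = [ in1[2], in2[2], in1[3], in2[3] ] */
    out1[0] = in1[0]; out1[1] = in2[0]; out1[2] = in1[1]; out1[3] = in2[1];
    out2[0] = in1[2]; out2[1] = in2[2]; out2[2] = in1[3]; out2[3] = in2[3];
}

static void uninterleave2_ref(const double in1[4], const double in2[4],
                              double out1[4], double out2[4])
{
    /* out1 = [ in1[0], in1[2], in2[0], in2[2] ]
       out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
    out1[0] = in1[0]; out1[1] = in1[2]; out1[2] = in2[0]; out1[3] = in2[2];
    out2[0] = in1[1]; out2[1] = in1[3]; out2[2] = in2[1]; out2[3] = in2[3];
}

int main(void)
{
    const double a[4] = {0, 1, 2, 3}, b[4] = {4, 5, 6, 7};
    double x[4], y[4], u[4], v[4];
    interleave2_ref(a, b, x, y);   /* x = {0,4,1,5}, y = {2,6,3,7}      */
    uninterleave2_ref(x, y, u, v); /* round-trips back to a and b       */
    for (int i = 0; i < 4; ++i) { assert(u[i] == a[i]); assert(v[i] == b[i]); }
    return 0;
}

Interleave followed by uninterleave is an identity, which is the property the SIMD macro implementations (built from the emulated shuffle/insert/extract helpers) are expected to preserve.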