/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at

 * http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.

 */

//see https://github.com/kunpengcompute/AvxToNeon
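//
// This header emulates the small subset of AVX double-precision intrinsics
// needed here on top of ARM NEON: each 256-bit register is modelled as a
// pair of 128-bit NEON vectors, and each operation is applied to both halves.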

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>
#include <assert.h>


#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

#error "Macro name collisions may happen with an unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif

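/* 256-bit AVX vector of 8 floats, stored as two NEON float32x4_t halves. */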
typedef struct {
    float32x4_t vect_f32[2];
} __m256;

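/* 256-bit AVX vector of 4 doubles, stored as two NEON float64x2_t halves. */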
typedef struct {
    float64x2_t vect_f64[2];
} __m256d;

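/* A 128-bit SSE2 double vector maps directly onto a single NEON float64x2_t. */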
typedef float64x2_t __m128d;

FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
    return ret;
}

FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
    return ret;
}

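/* Note: on x86, _mm256_load_pd requires 32-byte alignment while _mm256_loadu_pd
   does not; vld1q_f64 has no such requirement, so both variants compile to the
   same NEON loads here. */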
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    return a.vect_f64[0];
}

FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8];
}

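/* As with the Intel intrinsic, the upper 128 bits of the result are left
   undefined; only the lower half is meaningful after a cast from __m128d. */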
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    return res;
}
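
/*
 * Minimal illustrative sketch of how these shims compose (not part of the
 * original header; a real consumer simply includes this file and uses the
 * AVX-style names directly):
 *
 *     double buf[4] = { 1.0, 2.0, 3.0, 4.0 };
 *     __m256d v   = _mm256_loadu_pd(buf);
 *     __m256d acc = _mm256_add_pd(_mm256_mul_pd(v, _mm256_set1_pd(2.0)),
 *                                 _mm256_setzero_pd());
 *     __m128d lo  = _mm256_castpd256_pd128(acc);   // low two doubles: 2.0, 4.0
 */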

#endif /* PF_NEON_DBL_FROM_AVX_H */