path: root/simd/pf_neon_double_from_avx.h
/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//see https://github.com/kunpengcompute/AvxToNeon

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H

#include <assert.h>   /* assert() is used by _mm256_extractf128_pd and _mm256_insertf128_pd below */
#include <arm_neon.h>


#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))

#else

#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif

#endif
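
/*
 * Emulated AVX types: each 256-bit register is modeled as a pair of 128-bit
 * NEON vectors, with the low half in element 0 and the high half in
 * element 1. __m128d maps directly onto a NEON float64x2_t.
 */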

typedef struct {
	float32x4_t vect_f32[2];
} __m256;

typedef struct {
	float64x2_t vect_f64[2];
} __m256d;

typedef float64x2_t __m128d;

FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
	__m256d ret;
	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
	return ret;
}

FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
	__m256d res_m256d;
	res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
	res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
	return res_m256d;
}

FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
	__m256d res_m256d;
	res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
	res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
	return res_m256d;
}

FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
	__m256d res_m256d;
	res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
	res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
	return res_m256d;
}

FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
	__m256d ret;
	ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
	return ret;
}
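
/*
 * Note: unlike the real AVX intrinsic, this _mm256_load_pd shim does not
 * require (or check) 32-byte alignment; it is implemented identically to
 * _mm256_loadu_pd using ordinary NEON loads.
 */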

FORCE_INLINE __m256d _mm256_load_pd(double const *mem_addr)
{
	__m256d res;
	res.vect_f64[0] = vld1q_f64(mem_addr);
	res.vect_f64[1] = vld1q_f64(mem_addr + 2);
	return res;
}

FORCE_INLINE __m256d _mm256_loadu_pd(double const *mem_addr)
{
	__m256d res;
	res.vect_f64[0] = vld1q_f64(mem_addr);
	res.vect_f64[1] = vld1q_f64(mem_addr + 2);
	return res;
}

FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
	return a.vect_f64[0];
}

FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
	assert(imm8 >= 0 && imm8 <= 1);
	return a.vect_f64[imm8];
}
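
/*
 * Insert the 128-bit vector b into the low (imm8 == 0) or high (imm8 == 1)
 * half of a. Comparing imm8 against 0 yields an all-ones or all-zeros mask,
 * so vbslq_f64 can route b into the right half without a branch.
 */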
FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
{
	assert(imm8 == 0 || imm8 == 1);
	__m256d res;
	uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
	res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
	res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
	return res;
}
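
/*
 * Reinterpret a 128-bit vector as the low half of a 256-bit vector. The
 * upper 128 bits are left uninitialized, matching the AVX intrinsic, whose
 * upper half is documented as undefined.
 */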
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
	__m256d res;
	res.vect_f64[0] = a;
	return res;
}
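
/*
 * Usage sketch (illustrative, not part of the upstream header): the shims
 * above are enough to express a 4-wide double multiply-add on AArch64. The
 * helper name madd4 is hypothetical, and stores go through plain NEON since
 * this header provides no _mm256_storeu_pd shim.
 *
 *   static void madd4(const double *a, const double *b, double *out)
 *   {
 *       __m256d prod = _mm256_mul_pd(_mm256_loadu_pd(a), _mm256_loadu_pd(b));
 *       __m256d sum  = _mm256_add_pd(prod, _mm256_loadu_pd(out));
 *       vst1q_f64(out,     _mm256_castpd256_pd128(sum));    // low two doubles
 *       vst1q_f64(out + 2, _mm256_extractf128_pd(sum, 1));  // high two doubles
 *   }
 */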

#endif /* PF_NEON_DBL_FROM_AVX_H */